This module builds on code contained in Coronavirus_Statistics_USAF_v003.Rmd. This file includes the latest code for analyzing data from USA Facts. USA Facts maintains data on coronavirus cases and deaths by county in the US. Downloaded data are unique by county, with a separate file for each of cases, deaths, and population; the cases and deaths files hold one column per date.
The intent of this module is to rebuild the function readRunUSAFacts() so that it works with 2021 data as currently formatted by USA Facts. The code will then be included in appropriate .R modules that can be sourced.
The tidyverse library is loaded, and the functions used for CDC daily processing are sourced:
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.1 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Functions are available in source file
source("./Generic_Added_Utility_Functions_202105_v001.R")
source("./Coronavirus_CDC_Daily_Functions_v001.R")
The current readRunUSAFacts() function is copied:
# Function to run the USA Facts (US county-level coronavirus data) clustering process
readRunUSAFacts <- function(maxDate,
                            popLoc,
                            caseLoc,
                            deathLoc,
                            dlPop=FALSE,
                            dlCaseDeath=FALSE,
                            ovrWrite=FALSE,
                            ovrWriteError=TRUE,
                            oldFile=NULL,
                            showBurdenMinPop=10000,
                            minPopCluster=25000,
                            existingStateClusters=NULL,
                            existingCountyClusters=NULL,
                            createClusters=FALSE,
                            hierarchical=FALSE,
                            kCut=6,
                            orderCluster=TRUE,
                            ...
                            ) {

  # FUNCTION ARGUMENTS:
  # maxDate: the maximum date to use for data from the cases and deaths file
  # popLoc: location where the county-level population data are stored
  # caseLoc: location where the county-level cases data are stored
  # deathLoc: location where the county-level deaths data are stored
  # dlPop: boolean, should new population data be downloaded to popLoc
  # dlCaseDeath: boolean, should new case data and death data be downloaded to caseLoc and deathLoc
  # ovrWrite: boolean, if data are downloaded to an existing file, should it be over-written
  # ovrWriteError: boolean, if ovrWrite is FALSE and an attempt to overwrite is made, should it error out?
  # oldFile: old file for comparing metrics against (NULL means no old file for comparisons)
  # showBurdenMinPop: minimum population for showing in burden by cluster plots (NULL means skip plot)
  # minPopCluster: minimum population for including county in running cluster-level metrics
  # existingStateClusters: location of an existing named vector with clusters by state (NULL means none)
  # existingCountyClusters: location of an existing named vector with clusters by county (NULL means none)
  #                         if existingStateClusters is not NULL, then existingCountyClusters is ignored
  # createClusters: boolean, whether to create new clusters (only set up for kmeans)
  # hierarchical: whether to create hierarchical clusters
  #               TRUE means run hierarchical clustering
  #               FALSE means run kmeans clustering
  #               NA means run rules-based clustering
  # kCut: if hierarchical clustering is used, what k (number of clusters in cutree) should be used?
  # orderCluster: if FALSE, ignore; if TRUE, order by "dpm"; if anything else, order by orderCluster
  # ...: other arguments that will be passed to prepClusterCounties
  #
  # RETURNS: a list with pop, burdenData, clusterData, clustVec, helperACC_county, clusterStateData, maxDate

  # STEP 0: Download new files (if requested)
  urlCase <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_confirmed_usafacts.csv"
  urlDeath <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_deaths_usafacts.csv"
  urlPop <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/covid_county_population_usafacts.csv"

  # Helper function to download a file
  # The helper's arguments deliberately have different names than the outer ovrWrite/ovrWriteError;
  # a default of the form ovrWrite=ovrWrite refers to itself and fails with
  # "promise already under evaluation" the first time it is evaluated
  helperDownload <- function(url, loc, overwrite=ovrWrite, overwriteError=ovrWriteError) {
    # If the file exists, mention it and proceed as per the guard checks
    if (file.exists(loc)) {
      cat("\nFile:", loc, "already exists\n")
      if (!overwrite && overwriteError) stop("\nExiting due to ovrWrite=FALSE and ovrWriteError=TRUE\n")
      if (!overwrite && !overwriteError) {
        cat("\nFile is NOT downloaded again\n")
        return(invisible(NULL))
      }
    }
    # Download the file and change to read-only
    download.file(url, destfile=loc, method="curl")
    Sys.chmod(loc, mode="0555", use_umask = FALSE)
  }

  if (dlPop) helperDownload(urlPop, loc=popLoc)
  if (dlCaseDeath) helperDownload(urlCase, loc=caseLoc)
  if (dlCaseDeath) helperDownload(urlDeath, loc=deathLoc)

  # STEP 1: Read in the population file
  pop <- readr::read_csv(popLoc) %>%
    rename(countyName=`County Name`, state=State)

  # STEP 2: Read case and death data, combine, and add population totals and existing clusters
  burdenData <- readUSAFacts(
    caseFile=caseLoc,
    deathFile=deathLoc,
    countyPopFile=pop,
    oldFile=oldFile,
    showBurdenMinPop=showBurdenMinPop,
    maxDate=maxDate,
    stateClusters=existingStateClusters,
    countyClusters=existingCountyClusters,
    glimpseRaw=FALSE
  )

  # STEP 3: Create appropriately filtered data, and new clusters if requested
  clusterData <- prepClusterCounties(burdenFile=burdenData,
                                     maxDate=maxDate,
                                     minPop=minPopCluster,
                                     createClusters=createClusters,
                                     hierarchical=hierarchical,
                                     returnList=TRUE,
                                     ...
                                     )

  # STEP 4: Assess clusters against the new data
  # STEP 4a: Extract the county-level clusters (new clusters if created, existing otherwise)
  if (createClusters) {
    # The NA check has to come first, since if (NA) is an error
    if (is.na(hierarchical)) clustVec <- clusterData$objCluster$objCluster
    else if (hierarchical) clustVec <- cutree(clusterData$objCluster$objCluster, k=kCut)
    else clustVec <- clusterData$objCluster$objCluster$cluster
  }
  else {
    clustVec <- existingCountyClusters
  }

  # STEP 4b: Show the cumulative data, order by cluster, and keep the plots together
  helperACC_county <- helperAssessCountyClusters(vecCluster=clustVec,
                                                 dfPop=clusterData$countyFiltered,
                                                 dfBurden=clusterData$countyFiltered,
                                                 showCum=TRUE,
                                                 thruLabel=format(as.Date(maxDate), "%b %d, %Y"),
                                                 plotsTogether=TRUE,
                                                 orderCluster=orderCluster
                                                 )

  # STEP 5: Add back clusters not used for analysis (code 999) and associated disease data
  # May want to change the approach to population data
  clusterStateData <- helperMakeClusterStateData(dfPlot=helperACC_county,
                                                 dfPop=usmap::countypop,
                                                 dfBurden=clusterData$countyDailyPerCapita,
                                                 orderCluster=orderCluster
                                                 )

  # STEP 6: Return a list of the key files
  list(pop=pop,
       burdenData=burdenData,
       clusterData=clusterData,
       clustVec=clustVec,
       helperACC_county=helperACC_county,
       clusterStateData=clusterStateData,
       maxDate=maxDate
       )

}
The function is updated to make better use of the functional form. Broadly, the process will include:
The function includes:
# Function to get county-level population data
getCountyData <- function(df=readFromRDS("countyPop2021"),
                          renameVars=c("State"="state", "County Name"="countyName", "population"="pop"),
                          keepVars=c("countyFIPS", "countyName", "state", "pop"),
                          selfList=list(),
                          fullList=list(),
                          lstFilter=list(),
                          lstExclude=list()
                          ) {

  # FUNCTION ARGUMENTS:
  # df: the data frame containing county-level population data
  # renameVars: variables to be renamed, as a named vector with format "originalName"="newName"
  # keepVars: variables to be kept in the final file (NULL means keep all)
  # selfList: functions to apply to a column in place, list('variable'=fn) applies variable=fn(variable)
  #           processed in order, so more than one function can be applied to the same column
  # fullList: general functions to be applied, list('new variable'=expression(code))
  #           creates 'new variable' as eval(expression(code)); for now, an expression must be passed
  # lstFilter: records to keep, of form list("field"=c("allowed values"))
  # lstExclude: records to drop, of form list("field"=c("disallowed values"))

  # Each step feeds the next: rename, then select, then mutate, then filter rows
  dfRenamed <- colRenamer(df, vecRename=renameVars)
  dfKept <- colSelector(dfRenamed, vecSelect=keepVars)
  dfMutated <- colMutater(dfKept, selfList=selfList, fullList=fullList)
  rowFilter(dfMutated, lstFilter=lstFilter, lstExclude=lstExclude)

}
# Lookup objects for combining data elements from raw files
# urlMapper: named character vector translating a urlType into the URL it is downloaded from
usafMainURL <- "https://usafactsstatic.blob.core.windows.net/public/data/covid-19/"
urlMapper <- c(
  "cdcDaily"="https://data.cdc.gov/api/views/9mfq-cb36/rows.csv?accessType=DOWNLOAD",
  "cdcHosp"="https://beta.healthdata.gov/api/views/g62h-syeh/rows.csv?accessType=DOWNLOAD",
  # The USA Facts files all sit under usafMainURL; vapply() keeps the urlType names (paste0 alone drops them)
  vapply(c("usafCase"="covid_confirmed_usafacts.csv",
           "usafDeath"="covid_deaths_usafacts.csv",
           "usafPop"="covid_county_population_usafacts.csv"
           ),
         FUN=function(csvName) paste0(usafMainURL, csvName),
         FUN.VALUE=character(1)
         )
)
# renMapper: per-urlType renaming vectors, c('existing name'='new name'), for colRenamer(vecRename=...)
renMapper <- local({
  # The USA Facts cases and deaths files are renamed the same way
  usafRename <- c("County Name"="countyName", "State"="state")
  list(
    "cdcDaily"=c('submission_date'='date',
                 'new_case'='new_cases',
                 'tot_death'='tot_deaths',
                 'new_death'='new_deaths'
                 ),
    "cdcHosp"=c("inpatient_beds_used_covid"="inp",
                "total_adult_patients_hospitalized_confirmed_and_suspected_covid"="hosp_adult",
                "total_pediatric_patients_hospitalized_confirmed_and_suspected_covid"="hosp_ped"
                ),
    "usafCase"=usafRename,
    "usafDeath"=usafRename,
    # An empty vector is NULL, so the default entry exists by name but holds NULL
    "default"=c()
  )
})
# Pads a vector to a fixed width (by default with zeroes on the left)
# x: values to pad; converted with as.character() first unless convChar is FALSE
# width, side, pad: passed straight to stringr::str_pad()
zeroPad <- function(x, width, side="left", pad="0", convChar=TRUE) {
  padInput <- x
  if (convChar) padInput <- as.character(x)
  stringr::str_pad(padInput, width=width, side=side, pad=pad)
}

# Fixed-width conveniences (width 5 and width 2), with any other arguments forwarded to zeroPad()
zeroPad5 <- function(x, ...) {
  zeroPad(x, width=5, ...)
}
zeroPad2 <- function(x, ...) {
  zeroPad(x, width=2, ...)
}
# selfListMapper: per-urlType list('variable'=fn) of in-place conversions for colMutater(selfList=...)
selfListMapper <- local({
  # Both USA Facts files zero-pad the county FIPS to 5 characters and the state FIPS to 2
  usafFIPSPads <- list('countyFIPS'=zeroPad5, 'stateFIPS'=zeroPad2)
  list(
    "cdcDaily"=list('date'=lubridate::mdy),
    "cdcHosp"=list(),
    "usafCase"=usafFIPSPads,
    "usafDeath"=usafFIPSPads,
    "default"=list()
  )
})
# fullListMapper: per-urlType entries for colMutater(fullList=...); every urlType currently maps to an empty list
fullListMapper <- local({
  urlTypes <- c("cdcDaily", "cdcHosp", "usafCase", "usafDeath", "default")
  stats::setNames(rep(list(list()), length(urlTypes)), urlTypes)
})
# pivotMapper: per-urlType key columns that stay in place when pivoting, for pivotData(pivotBy=...)
pivotMapper <- local({
  usafKeys <- c("countyFIPS", "countyName", "state", "stateFIPS")
  list("usafCase"=usafKeys, "usafDeath"=usafKeys)
})
# rawMakeVarMapper: per-urlType name given to the value column produced by the pivot
rawMakeVarMapper <- list(
  "usafCase"="cases",
  "usafDeath"="deaths"
)
# uqMapper: per-urlType columns that should uniquely identify a row, for checkUniqueRows(uniqueBy=...)
uqMapper <- local({
  cdcKeys <- c("state", "date")
  usafKeys <- c("countyFIPS", "stateFIPS", "date")
  list("cdcDaily"=cdcKeys, "cdcHosp"=cdcKeys, "usafCase"=usafKeys, "usafDeath"=usafKeys)
})
# lstFilterMapper: rows to keep (typically, states to be kept)
# One named list per urlType, where the name is the field and the element is its allowed values
lstFilterMapper <- local({
  # The CDC files keep the 50 states in state.abb plus DC; the USA Facts files are not filtered here
  keepStates <- list("state"=c(state.abb, "DC"))
  list("cdcDaily"=keepStates, "cdcHosp"=keepStates, "usafCase"=list(), "usafDeath"=list())
})
# lstExcludeMapper: rows to drop (typically, unallocated counties to be deleted)
# One named list per urlType, where the name is the field and the element is its disallowed values
lstExcludeMapper <- local({
  dropFIPS <- list("countyFIPS"=c("00000", "00001"))
  list("cdcDaily"=list(), "cdcHosp"=list(), "usafCase"=dropFIPS, "usafDeath"=dropFIPS)
})
# vecSelectMapper: columns kept in the processed data
# Named list where each name is a urlType and each value is the vector of fields to keep
vecSelectMapper <- local({
  cdcKeys <- c("date", "state")
  usafKeys <- c("countyFIPS", "state", "date")
  list(
    "cdcDaily"=c(cdcKeys, "tot_cases", "tot_deaths", "new_cases", "new_deaths"),
    "cdcHosp"=c(cdcKeys, "inp", "hosp_adult", "hosp_ped"),
    "usafCase"=c(usafKeys, "cases", "new_cases"),
    "usafDeath"=c(usafKeys, "deaths", "new_deaths")
  )
})
# checkControlGroupMapper: grouping variable per urlType for the control-total checks
# Every named urlType groups by date; the default entry is present by name but holds NULL
checkControlGroupMapper <- list(
  "cdcDaily"=c("date"),
  "cdcHosp"=c("date"),
  "usafDeath"=c("date"),
  "usafCase"=c("date"),
  "default"=NULL
)
# checkControlVarsMapper: numeric columns per urlType that are summarized by the control-total grouping variable
checkControlVarsMapper <- list(
  "cdcDaily"=paste0("new_", c("cases", "deaths")),
  "cdcHosp"=c("inp", paste0("hosp_", c("adult", "ped"))),
  "usafDeath"=c("deaths", "new_deaths"),
  "usafCase"=c("cases", "new_cases")
)
# checkSimilarityMapper: universe similarity checks to perform and report, per urlType
# Each entry is passed as checkSimilarity(..., keyVars=) and holds one settings list per field checked
checkSimilarityMapper <- local({
  # Dates are checked the same way for every urlType
  dateCheck <- list(label='date', countOnly=TRUE, convChar=TRUE)
  # The entity field differs by source: state for the CDC files, countyFIPS for the USA Facts files
  cdcChecks <- list(date=dateCheck, state=list(label='state', countOnly=FALSE))
  usafChecks <- list(date=dateCheck, countyFIPS=list(label='county', countOnly=FALSE))
  list(
    "cdcDaily"=cdcChecks,
    "cdcHosp"=cdcChecks,
    "usafCase"=usafChecks,
    "usafDeath"=usafChecks,
    "default"=list()
  )
})
# plotSimilarityMapper: fields, per urlType, where a change in universe should be reported by plotSimilarity()
# Every named urlType reports on date; the default entry is present by name but holds NULL
plotSimilarityMapper <- list(
  "cdcDaily"="date",
  "cdcHosp"="date",
  "usafCase"="date",
  "usafDeath"="date",
  "default"=NULL
)
# keyAggMapper: aggregated control total checks to perform
# Formatted as one list per urlType
# Within each urlType list, sublists l1/l2/l3 drive the grouping variable, numerical aggregates, and reporting
keyAggMapper <- local({

  # Sort order shared by every check that flags large deltas
  deltaSort <- c("name", "pctDelta", "absDelta")

  # A check that returns deltas and flags the large ones (shape used by l1 and l3)
  flaggedCheck <- function(grpVar, numVars, sameUniverse, plotData, isLine, pctTol, absTol) {
    list("grpVar"=grpVar,
         "numVars"=numVars,
         "sameUniverse"=sameUniverse,
         "plotData"=plotData,
         "isLine"=isLine,
         "returnDelta"=TRUE,
         "flagLargeDelta"=TRUE,
         "pctTol"=pctTol,
         "absTol"=absTol,
         "sortBy"=deltaSort,
         "dropNA"=TRUE,
         "printAll"=TRUE
         )
  }

  # A plot-only check by state, with no deltas returned or flagged (shape used by l2)
  plotOnlyByState <- function(numVars) {
    list("grpVar"="state",
         "numVars"=numVars,
         "sameUniverse"=NA,
         "plotData"=TRUE,
         "isLine"=FALSE,
         "returnDelta"=FALSE,
         "flagLargeDelta"=FALSE
         )
  }

  # The three checks for one urlType
  # l1 is by date (line plot, tolerances 0.05 and 5); l3 is by entity on the same universe of dates
  # (tolerances 0.001 and 0)
  checkSet <- function(dateVars, stateVars, entityVars, entityGrp, entityPlot) {
    list("l1"=flaggedCheck(grpVar="date", numVars=dateVars, sameUniverse=NA,
                           plotData=TRUE, isLine=TRUE, pctTol=0.05, absTol=5
                           ),
         "l2"=plotOnlyByState(stateVars),
         "l3"=flaggedCheck(grpVar=entityGrp, numVars=entityVars, sameUniverse="date",
                           plotData=entityPlot, isLine=FALSE, pctTol=0.001, absTol=0
                           )
         )
  }

  # Numeric fields per source, and the county keys used for the USA Facts entity-level check
  cdcAll <- c("new_cases", "new_deaths", "tot_cases", "tot_deaths")
  cdcNew <- c("new_cases", "new_deaths")
  hospVars <- c("inp", "hosp_adult", "hosp_ped")
  deathVars <- c("deaths", "new_deaths")
  caseVars <- c("cases", "new_cases")
  countyKeys <- c("countyFIPS", "countyName", "state")

  list("cdcDaily"=checkSet(cdcAll, cdcNew, cdcAll, entityGrp="state", entityPlot=TRUE),
       "cdcHosp"=checkSet(hospVars, hospVars, hospVars, entityGrp="state", entityPlot=TRUE),
       "usafDeath"=checkSet(deathVars, deathVars, deathVars, entityGrp=countyKeys, entityPlot=FALSE),
       "usafCase"=checkSet(caseVars, caseVars, caseVars, entityGrp=countyKeys, entityPlot=FALSE)
       )

})
# lstComboMapper: data elements to be combined
# Formatted as one list per urlType, with that list having one list for each combination of data
lstComboMapper <- local({
  # Relabels NYC as NY in the state field (NY stays NY); the function fn is specNA(sum)
  nycCombo <- list("comboVar"="state",
                   "uqVars"="date",
                   "vecCombo"=c("NY"="NY", "NYC"="NY"),
                   "fn"=specNA(sum)
                   )
  list("cdcDaily"=list("nyc"=nycCombo),
       "cdcHosp"=list()
       )
})
# perCapMapper: mapping used for creating per-capita metrics
# Formatted as c('raw variable name'='associated per capita variable name')
perCapMapper <- c(
  # Cumulative burden under two sets of names (tot_* and the unprefixed form); both map to the same outputs
  c("tot_cases"="tcpm", "tot_deaths"="tdpm"),
  c("cases"="tcpm", "deaths"="tdpm"),
  # New burden
  c("new_cases"="cpm", "new_deaths"="dpm"),
  # Hospitalization fields
  c("inp"="hpm", "hosp_adult"="ahpm", "hosp_ped"="phpm")
)
# plotCombineAggByMapper: aggregation recipes, one entry for "state" and one for "county"
# agg1 applies specNA(specSumProd) to pop with no weight variable and no prefix
# agg2 applies specNA(weighted.mean) to the 7-suffixed metrics, weighted by pop, with prefix "wm_"
plotCombineAggByMapper <- local({
  # Builds the agg1/agg2 pair; only the set of weighted-mean variables differs between state and county
  aggPair <- function(wmVars) {
    list("agg1"=list(aggFunc=specNA(specSumProd),
                     aggVars=c("pop"),
                     wtVar=NULL,
                     prefix=NULL
                     ),
         "agg2"=list(aggFunc=specNA(weighted.mean),
                     aggVars=wmVars,
                     wtVar="pop",
                     prefix="wm_"
                     )
         )
  }
  list("state"=aggPair(c("tcpm7", "tdpm7", "cpm7", "dpm7", "hpm7")),
       "county"=aggPair(c("tcpm7", "tdpm7", "cpm7", "dpm7"))
       )
})
# Function to pivot the data file longer (or wider)
pivotData <- function(df,
                      pivotKeys,
                      nameVar="name",
                      valVar="value",
                      toLonger=TRUE,
                      ...
                      ) {

  # FUNCTION ARGUMENTS:
  # df: the data frame
  # pivotKeys: the keys (everything but cols for pivot_longer, id_cols for pivot_wider)
  # nameVar: variable name for names_to or names_from
  # valVar: variable name for values_to or values_from
  # toLonger: boolean, should pivot_longer() be used rather than pivot_wider()?
  #           anything other than a single TRUE means pivot_wider()
  # ...: other arguments to be passed to pivot_*()
  #
  # RETURNS: the pivoted data frame

  if (isTRUE(toLonger)) {
    # Every column that is not a key is pivoted down into the nameVar/valVar pair
    tidyr::pivot_longer(df,
                        cols=-tidyr::all_of(pivotKeys),
                        names_to=nameVar,
                        values_to=valVar,
                        ...
                        )
  } else {
    # id_cols is passed by name: tidyr 1.3.0 moved ... ahead of id_cols and deprecated positional matching
    tidyr::pivot_wider(df,
                       id_cols=tidyr::all_of(pivotKeys),
                       names_from=tidyr::all_of(nameVar),
                       values_from=tidyr::all_of(valVar),
                       ...
                       )
  }

}
# Function to read and QC raw USA Facts data
readQCRawUSAF <- function(fileName,
                          writeLog=NULL,
                          ovrwriteLog=TRUE,
                          dfRef=NULL,
                          urlType=NULL,
                          url=NULL,
                          getData=TRUE,
                          ovrWriteDownload=FALSE,
                          vecRename=NULL,
                          selfList=NULL,
                          fullList=NULL,
                          uniqueBy=NULL,
                          pivotBy=NULL,
                          rawMakeVar=NULL,
                          step3Group=NULL,
                          step3Vals=NULL,
                          step4KeyVars=NULL,
                          step5PlotItems=NULL,
                          step6AggregateList=NULL,
                          inferVars=list("url"=urlMapper,
                                         "vecRename"=renMapper,
                                         "selfList"=selfListMapper,
                                         "fullList"=fullListMapper,
                                         "uniqueBy"=uqMapper,
                                         "pivotBy"=pivotMapper,
                                         "rawMakeVar"=rawMakeVarMapper,
                                         "step3Group"=checkControlGroupMapper,
                                         "step3Vals"=checkControlVarsMapper,
                                         "step4KeyVars"=checkSimilarityMapper,
                                         "step5PlotItems"=plotSimilarityMapper,
                                         "step6AggregateList"=keyAggMapper
                                         )
                          ) {

  # FUNCTION ARGUMENTS:
  # fileName: the location where downloaded data either is, or will be, stored
  # writeLog: the external file location for printing (NULL means use the main log stdout)
  # ovrwriteLog: boolean, if using an external log, should it be started from scratch (overwritten)?
  #              NOTE: not used inside this function, which always appends to an existing log
  # dfRef: a reference data frame for comparison (either NULL or NA means do not run comparisons)
  # urlType: character vector that can be mapped using urlMapper and keyVarMapper
  # url: direct URL passed as character string
  #      NOTE that if both url and urlType are NULL, no file will be downloaded
  # getData: boolean, should an attempt be made to get new data using urlType or url?
  # ovrWriteDownload: boolean, if fileName already exists, should it be overwritten?
  # vecRename: vector for renaming c('existing name'='new name'), can be any length from 0 to ncol(df)
  #            NULL means infer from urlType, if not available there use c()
  # selfList: list for functions to apply to self, list('variable'=fn) will apply variable=fn(variable)
  #           processed in order, so more than one function can be applied to self
  #           NULL means infer from urlType, if not available in mapping file use list()
  # fullList: list for general functions to be applied, list('new variable'=expression(code))
  #           will create 'new variable' as eval(expression(code))
  #           for now, requires passing an expression
  #           NULL means infer from urlType, use list() if not in mapping file
  # uniqueBy: combination of variables for checking uniqueness of pivoted file
  #           NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
  # pivotBy: combination of variables that should NOT be pivoted
  # rawMakeVar: variable name to be used for the numeric data pivoted down from columns
  #             NULL means infer from data, keep as NULL (meaning use-all) if cannot be inferred
  # step3Group: variable to be used as the x-axis (grouping) for step 3 plots
  #             NULL means infer from data
  # step3Vals: values to be plotted on the y-axis for step 3 plots
  #            NULL means infer from data
  # step4KeyVars: list of parameters to be passed as keyVars= in step 4
  #               NULL means infer from urlType
  # step5PlotItems: items to be plotted in step 5
  #                 NULL means infer from urlType
  # step6AggregateList: drives the elements to be passed to compareAggregate() and flagLargeDelta()
  #                     NULL means infer from urlType
  # inferVars: list of 'argument name'=mapper for inferring parameter values when passed as NULL
  #
  # RETURNS: the long-format raw data frame, one row per uniqueBy combination, with a new_<rawMakeVar> column

  # Step 0a: Use urlType to infer key variables if passed as NULL
  # isTRUE() guards the lookup so that urlType=NULL falls through to the "default" entry (if any)
  # rather than handing if() a zero-length condition
  for (vrbl in names(inferVars)) {
    mapper <- inferVars[[vrbl]]
    if (is.null(get(vrbl))) {
      if (isTRUE(urlType %in% names(mapper))) assign(vrbl, mapper[[urlType]])
      else if ("default" %in% names(mapper)) assign(vrbl, mapper[["default"]])
    }
  }

  # Step 1: Download a new file (if requested)
  if (!is.null(url) && isTRUE(getData)) fileDownload(fileName=fileName, url=url, ovrWrite=ovrWriteDownload)
  else cat("\nNo file has been downloaded, will use existing file:", fileName, "\n")

  # Step 2: Read file, rename and mutate variables, confirm uniqueness by expected levels
  # The wide file is checked for uniqueness by pivotBy, and every other column is pivoted into date/rawMakeVar
  # After dates are parsed with mdy, the long file is checked by uniqueBy
  # New burden is the change from the previous date within each entity (first date keeps its level)
  dfRaw <- fileRead(fileName) %>%
    colRenamer(vecRename) %>%
    colMutater(selfList=selfList, fullList=fullList) %>%
    checkUniqueRows(uniqueBy=pivotBy) %>%
    pivotData(pivotKeys=pivotBy, nameVar="date", valVar=rawMakeVar) %>%
    colMutater(selfList=list("date"=lubridate::mdy)) %>%
    checkUniqueRows(uniqueBy=uniqueBy) %>%
    arrange(across(c(setdiff(uniqueBy, "date"), "date"))) %>%
    group_by(across(setdiff(uniqueBy, "date"))) %>%
    mutate(newBurden=ifelse(row_number()==1, get(rawMakeVar), get(rawMakeVar)-lag(get(rawMakeVar)))) %>%
    ungroup() %>%
    colRenamer(vecRename=c("newBurden"=paste0("new_", rawMakeVar)))

  # Step 3: Plot basic control totals for the step3Vals variables, grouped by step3Group
  dfRaw %>%
    checkControl(groupBy=step3Group, useVars=step3Vals, printControls=FALSE, na.rm=TRUE) %>%
    helperLinePlot(x=step3Group, y="newValue", facetVar="name", facetScales="free_y", groupColor="name")

  # If there is no file for comparison (NULL or a single NA), return the data
  # A data frame is never treated as "no reference", even if it has a single column
  noReference <- is.null(dfRef) || (!is.data.frame(dfRef) && length(dfRef)==1 && isTRUE(is.na(dfRef)))
  if (noReference) return(dfRaw)

  # Step 4: Check similarity of existing and reference file
  # ovrwriteLog=FALSE since everything written here should be an append to an existing log
  diffRaw <- checkSimilarity(df=dfRaw,
                             ref=dfRef,
                             keyVars=step4KeyVars,
                             writeLog=writeLog,
                             ovrwriteLog=FALSE
                             )

  # Step 5: Plot the similarity checks
  plotSimilarity(diffRaw, plotItems=step5PlotItems)

  # Step 6: Plot and report on differences in aggregates
  # x is one sublist of step6AggregateList (grpVar, numVars, sameUniverse, plotData, isLine, returnDelta, ...)
  helperAggMap <- function(x) {
    h1 <- compareAggregate(df=dfRaw, ref=dfRef, grpVar=x$grpVar, numVars=x$numVars,
                           sameUniverse=x$sameUniverse, plotData=x$plotData, isLine=x$isLine,
                           returnDelta=x$returnDelta)
    if (isTRUE(x$flagLargeDelta)) {
      h2 <- flagLargeDelta(h1, pctTol=x$pctTol, absTol=x$absTol, sortBy=x$sortBy,
                           dropNA=x$dropNA, printAll=x$printAll
                           )
      # Without a log file the flagged records are printed; with one, only the count is printed
      # and the detail goes to the log
      if (is.null(writeLog)) print(h2)
      else {
        cat(nrow(h2), " records", sep="")
        txt <- paste0("\n\n***Differences of at least ",
                      x$absTol,
                      " and at least ",
                      round(100*x$pctTol, 3), "%\n\n"
                      )
        printLog(h2, txt=txt, writeLog=writeLog)
      }
    }
  }
  lapply(step6AggregateList, FUN=helperAggMap)

  cat("\n\n")

  # Return the raw data file
  dfRaw

}
# Function to obtain county clusters and return the county clusters vector
getCountyClusters <- function(obj,
                              hierarchical=FALSE,
                              kCut=0,
                              reAssign=list(),
                              defaultCluster=NULL
                              ) {

  # FUNCTION ARGUMENTS
  # obj: a clustering object returned by clusterCounties()
  # hierarchical: whether the clustering object is based on hierarchical clusters
  #               TRUE means from hierarchical clustering
  #               FALSE means from kmeans clustering
  #               NA means from rules-based clustering
  # kCut: if hierarchical clustering is used, what k (number of clusters in cutree) should be used?
  # reAssign: mapping file to change segments, as list('entity'='other entity cluster to use')
  # defaultCluster: cluster label to be assigned to any county that is not in obj$objCluster
  #                 NULL means do not add these to the clustering vector

  # Clusters for the counties that were part of the clustering run
  clust <- getClusters(obj$objCluster, hier=hierarchical, kCut=kCut, reAssign=reAssign)

  # With no default label, there is nothing to append
  if (is.null(defaultCluster)) return(clust)

  # Every distinct (sorted) value of state in obj$countyBelow gets the default label, appended after clust
  belowIds <- sort(unique(pull(obj$countyBelow, state)))
  belowClusters <- purrr::set_names(rep(defaultCluster, length(belowIds)), belowIds)
  c(clust, belowClusters)

}
# Function to take county-level data, prepare for clusterStates, and return resulting outputs
clusterCounties <- function(dfPerCapita,
                            hierarchical,
                            vecRename=c(),
                            clusterBy=c("state"),
                            arrangeBy=c("date"),
                            burdenMetrics=c("cpm", "dpm"),
                            popVar=c("population"),
                            vecSelect=c(clusterBy, arrangeBy, burdenMetrics, popVar),
                            uniqueBy=c(clusterBy, arrangeBy),
                            minPopCluster=1,
                            returnList=TRUE,
                            ...
                            ) {

  # FUNCTION ARGUMENTS:
  # dfPerCapita: a county-level file with per-capita metrics
  # hierarchical: whether to create hierarchical clusters
  #               TRUE means run hierarchical clustering
  #               FALSE means run kmeans clustering
  #               NA means run rules-based clustering
  # vecRename: renaming of input variables
  # clusterBy: the variable name used for clustering
  # arrangeBy: data will be sorted by a mix of clusterBy and this variable
  # burdenMetrics: the metrics to be used for burden in clustering
  # popVar: the column containing population data
  # vecSelect: selection of input variables
  # uniqueBy: the input file must be unique by, and will then be sorted by, uniqueBy
  # minPopCluster: minimum population for including county in running cluster-level metrics
  # returnList: boolean, passed through to clusterStates()
  # ...: other arguments that will be passed to clusterStates
  #
  # RETURNS: a list with objCluster (output of clusterStates), countyFiltered, and countyBelow

  # STEP 1: Rename and select variables, confirm uniqueness, sort, and flag the population threshold
  prepped <- colRenamer(dfPerCapita, vecRename=vecRename)
  prepped <- colSelector(prepped, vecSelect=vecSelect)
  prepped <- checkUniqueRows(prepped, uniqueBy=uniqueBy, returnDF=TRUE)
  prepped <- arrange(prepped, across(all_of(uniqueBy)))
  prepped <- mutate(prepped, popThresh=(get(popVar)>=minPopCluster))
  # The state column is converted to character
  countyData <- colMutater(prepped, selfList=c("state"=as.character))

  # STEP 2: Split data at the population threshold (at or above vs. below)
  countyFiltered <- filter(countyData, popThresh)
  countyBelow <- filter(countyData, !popThresh)

  # STEP 2a: Confirm that no county is in both data sets
  # The per-county counts from each half are stacked and then checked for uniqueness by county
  splitCounts <- bind_rows(count(countyFiltered, a=get(clusterBy), popThresh),
                           count(countyBelow, a=get(clusterBy), popThresh)
                           )
  checkUniqueRows(splitCounts, uniqueBy=c("a"), returnDF=FALSE, noteUnique=FALSE)

  # STEP 3: Run county-level clusters on the counties at or above the threshold
  objCluster <- clusterStates(countyFiltered, hierarchical=hierarchical, returnList=returnList, ...)

  # Return all of the relevant objects
  list(objCluster=objCluster,
       countyFiltered=countyFiltered,
       countyBelow=countyBelow
       )

}
# Function to run the USA Facts (US county-level coronavirus data) clustering process
readRunUSAFacts <- function(maxDate,
                            downloadTo=list("usafCase"=NA, "usafDeath"=NA),
                            readFrom=downloadTo,
                            compareFile=list("usafCase"=NA, "usafDeath"=NA),
                            writeLog=NULL,
                            ovrWriteLog=TRUE,
                            dfPerCapita=NULL,
                            showBurdenMinPop=10000,
                            minPopCluster=25000,
                            defaultCluster=NULL,
                            existingStateClusters=NULL,
                            existingCountyClusters=NULL,
                            createClusters=FALSE,
                            hierarchical=FALSE,
                            kCut=6,
                            orderCluster=TRUE,
                            reAssignCounty=list(),
                            skipAssessmentPlots=FALSE,
                            brewPalette=NA,
                            ...
                            ) {

  # FUNCTION ARGUMENTS:
  # maxDate: the maximum date to use for data from the cases and deaths file
  # downloadTo: named list for locations to download data (usafCase, usafDeath, usafPop)
  #             NA (or a missing element) means do not download data for that particular element
  # readFrom: named list for locations to read data from (defaults to downloadTo)
  # compareFile: named list for the reference file to be used for usafCase, usafDeath, usafPop
  #              NA means do not use a reference file for that element
  # writeLog: name of a separate log file for capturing detailed data on changes between files
  #           NULL means no detailed data captured
  # ovrWriteLog: boolean, should the log file be overwritten and started again from scratch?
  # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
  #              default NULL means create dfPerCapita using steps 2-4
  # showBurdenMinPop: minimum population for showing in burden by cluster plots (NULL means skip plot)
  # minPopCluster: minimum population for including county in running cluster-level metrics
  # defaultCluster: cluster label to be assigned to any county that falls below minPopCluster
  #                 NULL means do not add these to the clustering vector
  # existingStateClusters: location of an existing named vector with clusters by state (NULL means none)
  # existingCountyClusters: location of an existing named vector with clusters by county (NULL means none)
  #                         if existingStateClusters is not NULL, then existingCountyClusters is ignored
  #                         NOTE: existing clusters are not yet used by this version of the function
  # createClusters: boolean, whether to create new clusters (only set up for kmeans)
  # hierarchical: whether to create hierarchical clusters
  #               TRUE means run hierarchical clustering
  #               FALSE means run kmeans clustering
  #               NA means run rules-based clustering
  # kCut: if hierarchical clustering is used, what k (number of clusters in cutree) should be used?
  # orderCluster: if FALSE, ignore; if TRUE, order by "dpm"; if anything else, order by orderCluster
  # reAssignCounty: mapping file for assigning a county to another county's cluster
  #                 format list("countyToChange"="countyClusterToAssign")
  # skipAssessmentPlots: boolean, should cluster assessment plots be skipped?
  #                      NOTE: assessment plots are not yet implemented, so this currently has no effect
  # brewPalette: character vector length-1 referencing a color scheme from brewer_pal to use
  #              NA means use R default color schemes
  # ...: other arguments that will be passed to clusterCounties
  #
  # RETURNS: a list with countyData, dfRaw, dfProcess, dfPerCapita, useClusters, maxDate, plotDataList

  # STEP 1: Get a county-level population file, with fips as 5-digit character and non-zero population
  countyData <- getCountyData(selfList=list("countyFIPS"=zeroPad5), lstExclude=list("pop"=c(0)))

  # If a log file is requested, create the log file (allows for append=TRUE for all downstream functions)
  # The argument of this function is ovrWriteLog, while genNewLog() takes it as ovrwriteLog
  if (!is.null(writeLog)) genNewLog(writeLog=writeLog, ovrwriteLog=ovrWriteLog)

  # Get the data types to be used (elements of readFrom) and create a file storage list
  elemUsed <- names(readFrom)
  dfRawList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
  dfProcessList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)

  # Steps 2-4 are required only if dfPerCapita has not been passed
  if (is.null(dfPerCapita)) {

    # STEP 2: Download and QC each requested data element
    for (elem in elemUsed) {
      # Download only when a location is given; NA or an element missing from downloadTo means no download
      dlTarget <- downloadTo[[elem]]
      dfRawList[[elem]] <- readQCRawUSAF(fileName=readFrom[[elem]],
                                         writeLog=writeLog,
                                         ovrwriteLog=FALSE,
                                         urlType=elem,
                                         getData=!is.null(dlTarget) && !is.na(dlTarget),
                                         dfRef=compareFile[[elem]]
                                         )
      glimpseLog(dfRawList[[elem]], txt=paste0("\nRaw file for ", elem, ":\n"), logFile=writeLog)
    }

    # STEP 3: Process all requested data
    for (elem in elemUsed) {
      dfProcessList[[elem]] <- processRawFile(dfRawList[[elem]],
                                              vecRename=c(),
                                              vecSelect=vecSelectMapper[[elem]],
                                              lstCombo=lstComboMapper[[elem]],
                                              lstFilter=lstFilterMapper[[elem]],
                                              lstExclude=lstExcludeMapper[[elem]]
                                              )
      glimpseLog(dfProcessList[[elem]], txt=paste0("\nProcessed for ", elem, ":\n"), logFile=writeLog)
    }

    # STEP 4: Integrate to create a per-capita data file
    dfPerCapita <- createPerCapita(dfProcessList,
                                   uqBy=c("countyFIPS", "state", "date"),
                                   popData=countyData,
                                   popJoinBy=c("countyFIPS", "state"),
                                   mapper=perCapMapper
                                   )
    glimpseLog(dfPerCapita, txt="\nIntegrated per capita data file:\n", logFile=writeLog)

  } else {
    dfRawList <- NULL
    dfProcessList <- NULL
  }

  # STEP 5: Create clusters (if requested)
  if (isTRUE(createClusters)) {
    clData <- clusterCounties(dfPerCapita=dfPerCapita,
                              hierarchical=hierarchical,
                              minPopCluster=minPopCluster,
                              ...
                              )
    useClusters <- getCountyClusters(clData,
                                     hierarchical=hierarchical,
                                     kCut=kCut,
                                     reAssign=reAssignCounty,
                                     defaultCluster=defaultCluster
                                     )
  } else {
    clData <- NULL
    useClusters <- NULL  # Should fix this to allow passing of clusters
  }

  # STEP 6: Assess clusters
  # Cluster assessment is not yet implemented, so plotDataList is NULL whether or not plots are skipped
  # It has to be assigned in both cases; left unassigned, the return list below fails
  plotDataList <- NULL

  # Return statement, still need to update Step 6 (cluster assessment)
  list(countyData=countyData,
       dfRaw=dfRawList,
       dfProcess=dfProcessList,
       dfPerCapita=dfPerCapita,
       useClusters=useClusters,
       maxDate=maxDate,
       plotDataList=plotDataList
       )

}
The function is tested as it evolves:
# Run clusterCounties() on the county daily per-capita data held in the cty_20201026 object
# hierarchical=NA is the same setting later passed to getCountyClusters() for this object
testCluster <- clusterCounties(
  dfPerCapita=readFromRDS("cty_20201026")$clusterData$countyDailyPerCapita,
  hierarchical=NA,
  minPopCluster=25000,
  # Shape window and weighting ratios
  minShape="2020-04",
  maxShape="2020-09",
  ratioTotalvsShape=0.25,
  ratioDeathvsCase=0.001,
  # Floors and segment counts
  minCase=5000,
  minDeath=100,
  hmlSegs=3,
  eslSegs=3,
  # Fixed seed so the run can be compared against the stored results
  seed=2010261358
)
##
## *** File has been checked for uniqueness by: state date
# Cross-tabulate the newly computed cluster assignments (rows) against the clustVec
# stored in cty_20201026 (columns); a purely diagonal table means the two agree
table(
  testCluster$objCluster$objCluster,
  readFromRDS("cty_20201026")$clustVec
)
##
## 1 2 3 4 5 6 7 8 9
## 1 217 0 0 0 0 0 0 0 0
## 2 0 113 0 0 0 0 0 0 0
## 3 0 0 324 0 0 0 0 0 0
## 4 0 0 0 39 0 0 0 0 0
## 5 0 0 0 0 36 0 0 0 0
## 6 0 0 0 0 0 151 0 0 0
## 7 0 0 0 0 0 0 254 0 0
## 8 0 0 0 0 0 0 0 209 0
## 9 0 0 0 0 0 0 0 0 248
# Check that the names on the stored and the newly computed cluster vectors match exactly
# (same values in the same order)
identical(
  names(readFromRDS("cty_20201026")$clustVec),
  names(testCluster$objCluster$objCluster)
)
## [1] TRUE
# Extract the cluster vectors from the clustering object
# _001 supplies defaultCluster="999"; _002 leaves defaultCluster at the function default
vecTestCluster_001 <- getCountyClusters(testCluster, defaultCluster="999", hierarchical=NA)
vecTestCluster_002 <- getCountyClusters(testCluster, hierarchical=NA)

# County map of the _001 assignments (values plotted as returned by vecToTibble)
usmap::plot_usmap(
  data=vecToTibble(vecTestCluster_001, colNameName="fips"),
  values="value",
  regions="counties"
)

# County map of the _002 assignments, with value converted to a factor before plotting
usmap::plot_usmap(
  data=vecToTibble(vecTestCluster_002, colNameName="fips") %>%
    mutate(value=factor(value)),
  values="value",
  regions="counties"
)
# Read and QC the 2020-09-17 deaths file from disk (getData=FALSE, so nothing is downloaded)
# The result is used below as the reference (dfRef) when checking a later deaths file
testDFRefDeath <- readQCRawUSAF(
  "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20200917.csv",
  urlType="usafDeath",
  getData=FALSE
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20200917.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
# Read and QC the 2021-01-04 deaths file from disk and compare it against the
# 2020-09-17 reference data (testDFRefDeath)
usafDeathTest <- readQCRawUSAF(
  "./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210104.csv",
  dfRef=testDFRefDeath,
  urlType="usafDeath",
  getData=FALSE
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20210104.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 109
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## countyFIPS countyName state name newValue refValue
## 1 02050 Bethel Census Area AK new_deaths 0 1
## 2 02195 Petersburg Census Area AK new_deaths 0 1
## 3 55078 Menominee County WI new_deaths 0 1
## 4 05109 Pike County AR new_deaths 1 2
## 5 18121 Parke County IN new_deaths 1 2
## 6 47153 Sequatchie County TN new_deaths 1 2
## 7 01031 Coffee County AL new_deaths 5 7
## 8 08051 Gunnison County CO new_deaths 5 7
## 9 13159 Jasper County GA new_deaths 3 4
## 10 20015 Butler County KS new_deaths 3 4
## 11 47145 Roane County TN new_deaths 3 4
## 12 48395 Robertson County TX new_deaths 3 4
## 13 36121 Wyoming County NY new_deaths 4 5
## 14 01029 Cleburne County AL new_deaths 5 6
## 15 47177 Warren County TN new_deaths 7 8
## 16 51685 Manassas Park city VA new_deaths 7 8
## 17 47139 Polk County TN new_deaths 10 11
## 18 54099 Wayne County WV new_deaths 10 11
## 19 45001 Abbeville County SC new_deaths 11 12
## 20 47045 Dyer County TN new_deaths 12 13
## 21 13027 Brooks County GA new_deaths 25 27
## 22 29145 Newton County MO new_deaths 13 14
## 23 41071 Yamhill County OR new_deaths 13 14
## 24 18123 Perry County IN new_deaths 14 15
## 25 06005 Amador County CA new_deaths 15 16
## 26 21037 Campbell County KY new_deaths 15 16
## 27 45011 Barnwell County SC new_deaths 16 17
## 28 01087 Macon County AL new_deaths 18 19
## 29 12051 Hendry County FL new_deaths 40 42
## 30 01041 Crenshaw County AL new_deaths 30 31
## 31 26087 Lapeer County MI new_deaths 36 37
## 32 25003 Berkshire County MA new_deaths 48 47
## 33 04003 Cochise County AZ new_deaths 68 69
## 34 04025 Yavapai County AZ new_deaths 79 80
## 35 45035 Dorchester County SC new_deaths 87 88
## 36 34033 Salem County NJ new_deaths 88 89
## 37 26081 Kent County MI new_deaths 172 173
## 38 34037 Sussex County NJ new_deaths 197 198
## 39 26049 Genesee County MI new_deaths 306 307
## 40 48453 Travis County TX new_deaths 395 396
## 41 36061 New York County NY new_deaths 3173 3177
## 42 34027 Morris County NJ new_deaths 830 831
## 43 02195 Petersburg Census Area AK deaths 0 164
## 44 02050 Bethel Census Area AK deaths 0 25
## 45 55078 Menominee County WI deaths 0 1
## 46 18121 Parke County IN deaths 48 74
## 47 47153 Sequatchie County TN deaths 36 49
## 48 36121 Wyoming County NY deaths 641 788
## 49 08051 Gunnison County CO deaths 777 947
## 50 48395 Robertson County TX deaths 126 146
## 51 01031 Coffee County AL deaths 400 461
## 52 20015 Butler County KS deaths 80 88
## 53 13159 Jasper County GA deaths 175 190
## 54 05109 Pike County AR deaths 51 54
## 55 47145 Roane County TN deaths 103 107
## 56 47139 Polk County TN deaths 356 365
## 57 41071 Yamhill County OR deaths 1478 1503
## 58 01029 Cleburne County AL deaths 242 246
## 59 18123 Perry County IN deaths 1010 1025
## 60 45001 Abbeville County SC deaths 467 473
## 61 01087 Macon County AL deaths 1333 1350
## 62 47177 Warren County TN deaths 319 323
## 63 06005 Amador County CA deaths 343 347
## 64 51685 Manassas Park city VA deaths 789 795
## 65 45011 Barnwell County SC deaths 571 574
## 66 01041 Crenshaw County AL deaths 1034 1039
## 67 21037 Campbell County KY deaths 1846 1852
## 68 29145 Newton County MO deaths 620 622
## 69 13027 Brooks County GA deaths 1912 1917
## 70 12051 Hendry County FL deaths 3629 3637
## 71 47045 Dyer County TN deaths 527 528
## 72 34033 Salem County NJ deaths 9058 9075
## 73 54099 Wayne County WV deaths 1317 1319
## 74 26087 Lapeer County MI deaths 4767 4774
## absDelta pctDelta
## 1 1 2.000000000
## 2 1 2.000000000
## 3 1 2.000000000
## 4 1 0.666666667
## 5 1 0.666666667
## 6 1 0.666666667
## 7 2 0.333333333
## 8 2 0.333333333
## 9 1 0.285714286
## 10 1 0.285714286
## 11 1 0.285714286
## 12 1 0.285714286
## 13 1 0.222222222
## 14 1 0.181818182
## 15 1 0.133333333
## 16 1 0.133333333
## 17 1 0.095238095
## 18 1 0.095238095
## 19 1 0.086956522
## 20 1 0.080000000
## 21 2 0.076923077
## 22 1 0.074074074
## 23 1 0.074074074
## 24 1 0.068965517
## 25 1 0.064516129
## 26 1 0.064516129
## 27 1 0.060606061
## 28 1 0.054054054
## 29 2 0.048780488
## 30 1 0.032786885
## 31 1 0.027397260
## 32 1 0.021052632
## 33 1 0.014598540
## 34 1 0.012578616
## 35 1 0.011428571
## 36 1 0.011299435
## 37 1 0.005797101
## 38 1 0.005063291
## 39 1 0.003262643
## 40 1 0.002528445
## 41 4 0.001259843
## 42 1 0.001204094
## 43 164 2.000000000
## 44 25 2.000000000
## 45 1 2.000000000
## 46 26 0.426229508
## 47 13 0.305882353
## 48 147 0.205738279
## 49 170 0.197215777
## 50 20 0.147058824
## 51 61 0.141695703
## 52 8 0.095238095
## 53 15 0.082191781
## 54 3 0.057142857
## 55 4 0.038095238
## 56 9 0.024965326
## 57 25 0.016772895
## 58 4 0.016393443
## 59 15 0.014742015
## 60 6 0.012765957
## 61 17 0.012672382
## 62 4 0.012461059
## 63 4 0.011594203
## 64 6 0.007575758
## 65 3 0.005240175
## 66 5 0.004823927
## 67 6 0.003244997
## 68 2 0.003220612
## 69 5 0.002611648
## 70 8 0.002202037
## 71 1 0.001895735
## 72 17 0.001875034
## 73 2 0.001517451
## 74 7 0.001467351
# Read and QC the 2020-09-17 confirmed-cases file from disk (getData=FALSE, so nothing is downloaded)
# The result is used below as the reference (dfRef) when checking a later cases file
testDFRefCase <- readQCRawUSAF(
  "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20200917.csv",
  urlType="usafCase",
  getData=FALSE
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20200917.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
# Read and QC the 2021-01-04 confirmed-cases file from disk and compare it against the
# 2020-09-17 reference data (testDFRefCase)
usafCaseTest <- readQCRawUSAF(
  "./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210104.csv",
  dfRef=testDFRefCase,
  urlType="usafCase",
  getData=FALSE
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20210104.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 109
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## countyFIPS countyName state name newValue refValue
## 1 00000 Statewide Unallocated NC new_cases 0 1166
## 2 35021 Harding County NM new_cases 1 2
## 3 02016 Aleutians West Census Area AK new_cases 5 7
## 4 31171 Thomas County NE new_cases 4 5
## 5 35003 Catron County NM new_cases 9 11
## 6 20183 Smith County KS new_cases 5 6
## 7 48393 Roberts County TX new_cases 9 10
## 8 20161 Riley County KS new_cases 1262 1374
## 9 48417 Shackelford County TX new_cases 23 25
## 10 27069 Kittson County MN new_cases 12 13
## 11 20115 Marion County KS new_cases 86 93
## 12 49039 Sanpete County UT new_cases 172 185
## 13 20117 Marshall County KS new_cases 17 18
## 14 48437 Swisher County TX new_cases 97 102
## 15 50019 Orleans County VT new_cases 21 22
## 16 20053 Ellsworth County KS new_cases 26 27
## 17 29181 Ripley County MO new_cases 132 137
## 18 20055 Finney County KS new_cases 1893 1963
## 19 41037 Lake County OR new_cases 28 29
## 20 21161 Mason County KY new_cases 114 118
## 21 48283 La Salle County TX new_cases 346 358
## 22 29171 Putnam County MO new_cases 29 30
## 23 17009 Brown County IL new_cases 30 31
## 24 49041 Sevier County UT new_cases 94 97
## 25 31135 Perkins County NE new_cases 32 33
## 26 21189 Owsley County KY new_cases 36 37
## 27 49023 Juab County UT new_cases 110 113
## 28 51735 Poquoson city VA new_cases 74 76
## 29 08081 Moffat County CO new_cases 37 38
## 30 53043 Lincoln County WA new_cases 38 39
## 31 23003 Aroostook County ME new_cases 41 42
## 32 42123 Warren County PA new_cases 42 43
## 33 54065 Morgan County WV new_cases 45 46
## 34 27089 Marshall County MN new_cases 48 49
## 35 21127 Lawrence County KY new_cases 51 52
## 36 42117 Tioga County PA new_cases 52 53
## 37 15007 Kauai County HI new_cases 57 58
## 38 21231 Wayne County KY new_cases 122 124
## 39 29033 Carroll County MO new_cases 126 128
## 40 37055 Dare County NC new_cases 255 259
## 41 51610 Falls Church city VA new_cases 66 67
## 42 17059 Gallatin County IL new_cases 67 68
## 43 26011 Arenac County MI new_cases 67 68
## 44 48009 Archer County TX new_cases 67 68
## 45 48275 Knox County TX new_cases 67 68
## 46 48259 Kendall County TX new_cases 202 205
## 47 24023 Garrett County MD new_cases 70 71
## 48 24029 Kent County MD new_cases 289 293
## 49 48025 Bee County TX new_cases 1791 1815
## 50 18101 Martin County IN new_cases 151 153
## 51 05039 Dallas County AR new_cases 152 154
## 52 47033 Crockett County TN new_cases 481 487
## 53 23025 Somerset County ME new_cases 84 85
## 54 48001 Anderson County TX new_cases 2744 2776
## 55 18007 Benton County IN new_cases 86 87
## 56 41001 Baker County OR new_cases 89 90
## 57 24510 Baltimore City MD new_cases 15085 15236
## 58 21053 Clinton County KY new_cases 100 101
## 59 48057 Calhoun County TX new_cases 617 623
## 60 30053 Lincoln County MT new_cases 104 105
## 61 46127 Union County SD new_cases 329 332
## 62 48019 Bandera County TX new_cases 112 113
## 63 51073 Gloucester County VA new_cases 225 227
## 64 27021 Cass County MN new_cases 117 118
## 65 32013 Humboldt County NV new_cases 117 118
## 66 22043 Grant Parish LA new_cases 470 474
## 67 20077 Harper County KS new_cases 119 120
## 68 50021 Rutland County VT new_cases 128 129
## 69 55023 Crawford County WI new_cases 135 136
## 70 47065 Hamilton County TN new_cases 8859 8924
## 71 22059 La Salle Parish LA new_cases 412 415
## 72 21229 Washington County KY new_cases 139 140
## 73 54063 Monroe County WV new_cases 143 144
## 74 13071 Colquitt County GA new_cases 1871 1883
## 75 21031 Butler County KY new_cases 329 331
## 76 20191 Sumner County KS new_cases 166 167
## 77 21141 Logan County KY new_cases 528 531
## 78 48313 Madison County TX new_cases 713 717
## 79 27039 Dodge County MN new_cases 181 182
## 80 48481 Wharton County TX new_cases 1278 1285
## 81 29211 Sullivan County MO new_cases 184 185
## 82 22091 St. Helena Parish LA new_cases 394 396
## 83 18123 Perry County IN new_cases 210 211
## 84 48265 Kerr County TX new_cases 442 444
## 85 20085 Jackson County KS new_cases 225 226
## 86 05053 Grant County AR new_cases 226 227
## 87 29021 Buchanan County MO new_cases 1591 1598
## 88 41041 Lincoln County OR new_cases 470 472
## 89 47181 Wayne County TN new_cases 1471 1477
## 90 08067 La Plata County CO new_cases 247 248
## 91 37149 Polk County NC new_cases 253 254
## 92 21045 Casey County KY new_cases 264 265
## 93 22021 Caldwell Parish LA new_cases 297 298
## 94 18151 Steuben County IN new_cases 343 344
## 95 30029 Flathead County MT new_cases 734 736
## 96 51670 Hopewell city VA new_cases 372 373
## 97 48177 Gonzales County TX new_cases 822 824
## 98 42119 Union County PA new_cases 438 439
## 99 22013 Bienville Parish LA new_cases 444 445
## 100 34041 Warren County NJ new_cases 1422 1425
## 101 48187 Guadalupe County TX new_cases 1948 1952
## 102 22113 Vermilion Parish LA new_cases 1976 1980
## 103 18133 Putnam County IN new_cases 513 514
## 104 13169 Jones County GA new_cases 514 515
## 105 01131 Wilcox County AL new_cases 529 530
## 106 01105 Perry County AL new_cases 535 536
## 107 48253 Jones County TX new_cases 565 566
## 108 22127 Winn Parish LA new_cases 597 598
## 109 48361 Orange County TX new_cases 1844 1847
## 110 49037 San Juan County UT new_cases 683 684
## 111 01091 Marengo County AL new_cases 693 694
## 112 48285 Lavaca County TX new_cases 705 706
## 113 48255 Karnes County TX new_cases 752 753
## 114 51810 Virginia Beach City VA new_cases 6515 6523
## 115 34009 Cape May County NJ new_cases 981 982
## 116 00000 Statewide Unallocated NC cases 25112 42602
## 117 35021 Harding County NM cases 146 177
## 118 02016 Aleutians West Census Area AK cases 250 279
## 119 31171 Thomas County NE cases 205 217
## 120 35003 Catron County NM cases 525 547
## 121 48283 La Salle County TX cases 21806 22382
## 122 41037 Lake County OR cases 2243 2302
## 123 20161 Riley County KS cases 51360 51968
## 124 49039 Sanpete County UT cases 10292 10391
## 125 21189 Owsley County KY cases 1408 1419
## 126 29171 Putnam County MO cases 1058 1063
## 127 20183 Smith County KS cases 424 426
## 128 21161 Mason County KY cases 5828 5848
## 129 20115 Marion County KS cases 4456 4470
## 130 50019 Orleans County VT cases 2109 2115
## 131 49041 Sevier County UT cases 7068 7086
## 132 20117 Marshall County KS cases 792 794
## 133 27069 Kittson County MN cases 408 409
## 134 48437 Swisher County TX cases 7060 7077
## 135 20055 Finney County KS cases 216557 217002
## 136 49023 Juab County UT cases 6162 6174
## 137 21127 Lawrence County KY cases 2649 2654
## 138 15007 Kauai County HI cases 5802 5812
## 139 48393 Roberts County TX cases 655 656
## 140 20077 Harper County KS cases 3299 3304
## 141 48417 Shackelford County TX cases 1325 1327
## 142 20053 Ellsworth County KS cases 1567 1569
## 143 41001 Baker County OR cases 3290 3294
## 144 17059 Gallatin County IL cases 3332 3336
## 145 05039 Dallas County AR cases 5391 5397
## 146 27089 Marshall County MN cases 2781 2784
## 147 48025 Bee County TX cases 72547 72624
## absDelta pctDelta
## 1 1166 2.000000000
## 2 1 0.666666667
## 3 2 0.333333333
## 4 1 0.222222222
## 5 2 0.200000000
## 6 1 0.181818182
## 7 1 0.105263158
## 8 112 0.084977238
## 9 2 0.083333333
## 10 1 0.080000000
## 11 7 0.078212291
## 12 13 0.072829132
## 13 1 0.057142857
## 14 5 0.050251256
## 15 1 0.046511628
## 16 1 0.037735849
## 17 5 0.037174721
## 18 70 0.036307054
## 19 1 0.035087719
## 20 4 0.034482759
## 21 12 0.034090909
## 22 1 0.033898305
## 23 1 0.032786885
## 24 3 0.031413613
## 25 1 0.030769231
## 26 1 0.027397260
## 27 3 0.026905830
## 28 2 0.026666667
## 29 1 0.026666667
## 30 1 0.025974026
## 31 1 0.024096386
## 32 1 0.023529412
## 33 1 0.021978022
## 34 1 0.020618557
## 35 1 0.019417476
## 36 1 0.019047619
## 37 1 0.017391304
## 38 2 0.016260163
## 39 2 0.015748031
## 40 4 0.015564202
## 41 1 0.015037594
## 42 1 0.014814815
## 43 1 0.014814815
## 44 1 0.014814815
## 45 1 0.014814815
## 46 3 0.014742015
## 47 1 0.014184397
## 48 4 0.013745704
## 49 24 0.013311148
## 50 2 0.013157895
## 51 2 0.013071895
## 52 6 0.012396694
## 53 1 0.011834320
## 54 32 0.011594203
## 55 1 0.011560694
## 56 1 0.011173184
## 57 151 0.009960094
## 58 1 0.009950249
## 59 6 0.009677419
## 60 1 0.009569378
## 61 3 0.009077156
## 62 1 0.008888889
## 63 2 0.008849558
## 64 1 0.008510638
## 65 1 0.008510638
## 66 4 0.008474576
## 67 1 0.008368201
## 68 1 0.007782101
## 69 1 0.007380074
## 70 65 0.007310353
## 71 3 0.007255139
## 72 1 0.007168459
## 73 1 0.006968641
## 74 12 0.006393181
## 75 2 0.006060606
## 76 1 0.006006006
## 77 3 0.005665722
## 78 4 0.005594406
## 79 1 0.005509642
## 80 7 0.005462349
## 81 1 0.005420054
## 82 2 0.005063291
## 83 1 0.004750594
## 84 2 0.004514673
## 85 1 0.004434590
## 86 1 0.004415011
## 87 7 0.004390091
## 88 2 0.004246285
## 89 6 0.004070556
## 90 1 0.004040404
## 91 1 0.003944773
## 92 1 0.003780718
## 93 1 0.003361345
## 94 1 0.002911208
## 95 2 0.002721088
## 96 1 0.002684564
## 97 2 0.002430134
## 98 1 0.002280502
## 99 1 0.002249719
## 100 3 0.002107482
## 101 4 0.002051282
## 102 4 0.002022245
## 103 1 0.001947420
## 104 1 0.001943635
## 105 1 0.001888574
## 106 1 0.001867414
## 107 1 0.001768347
## 108 1 0.001673640
## 109 3 0.001625576
## 110 1 0.001463058
## 111 1 0.001441961
## 112 1 0.001417434
## 113 1 0.001328904
## 114 8 0.001227182
## 115 1 0.001018849
## 116 17490 0.516584458
## 117 31 0.191950464
## 118 29 0.109640832
## 119 12 0.056872038
## 120 22 0.041044776
## 121 576 0.026070426
## 122 59 0.025962596
## 123 608 0.011768349
## 124 99 0.009573079
## 125 11 0.007782101
## 126 5 0.004714757
## 127 2 0.004705882
## 128 20 0.003425831
## 129 14 0.003136903
## 130 6 0.002840909
## 131 18 0.002543451
## 132 2 0.002522068
## 133 1 0.002447980
## 134 17 0.002405036
## 135 445 0.002052777
## 136 12 0.001945525
## 137 5 0.001885725
## 138 10 0.001722060
## 139 1 0.001525553
## 140 5 0.001514463
## 141 2 0.001508296
## 142 2 0.001275510
## 143 4 0.001215067
## 144 4 0.001199760
## 145 6 0.001112347
## 146 3 0.001078167
## 147 77 0.001060818
# Process the QC'd case data: keep the key and case columns, apply no renames, combinations
# or filters, and exclude countyFIPS 00000 and 00001
# (00000 appears as "Statewide Unallocated" in the QC output above)
usafProcessCaseTest <- processRawFile(
  usafCaseTest,
  lstExclude=list("countyFIPS"=c("00000", "00001")),
  lstFilter=list(),
  lstCombo=list(),
  vecSelect=c("countyFIPS", "state", "date", "cases", "new_cases"),
  vecRename=c()
)
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 1.74e+9 2.01e+7 1108665
## 2 after 1.73e+9 2.00e+7 1090968
## 3 pctchg 5.59e-3 5.34e-3 0.0160
# Process the QC'd death data: keep the key and death columns, apply no renames, combinations
# or filters, and exclude countyFIPS 00000 and 00001
usafProcessDeathTest <- processRawFile(
  usafDeathTest,
  lstExclude=list("countyFIPS"=c("00000", "00001")),
  lstFilter=list(),
  lstCombo=list(),
  vecSelect=c("countyFIPS", "state", "date", "deaths", "new_deaths"),
  vecRename=c()
)
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 4.66e+7 343315 1108665
## 2 after 4.63e+7 341896 1090968
## 3 pctchg 5.05e-3 0.00413 0.0160
# Combine the processed case and death data with county population to build the per-capita file
# - records are keyed by countyFIPS, state and date
# - population comes from getCountyData(), with countyFIPS passed through zeroPad5 before joining
# - mapper gives the output name for each input metric (e.g. "new_deaths" becomes "dpm")
usafPerCapitaTest <- createPerCapita(
  list(usafProcessCaseTest, usafProcessDeathTest),
  uqBy=c("countyFIPS", "state", "date"),
  popData=colMutater(getCountyData(), selfList=list("countyFIPS"=zeroPad5)),
  popJoinBy=c("countyFIPS", "state"),
  popVar="pop",
  mapper=c(
    "deaths"="tdpm",
    "new_deaths"="dpm",
    "cases"="tcpm",
    "new_cases"="cpm"
  )
)
The overall function is then tested, with the cluster assessments to be added later:
# Create clusters from an existing per-capita data frame
# Supplying dfPerCapita means readRunUSAFacts() skips its read/process/integrate steps;
# the clustering settings mirror the standalone clusterCounties() test above
testFullUSAF_v001 <- readRunUSAFacts(
  maxDate=NA,
  dfPerCapita=readFromRDS("cty_20201026")$clusterData$countyDailyPerCapita,
  # Cluster creation and extraction
  createClusters=TRUE,
  hierarchical=NA,
  minPopCluster=25000,
  defaultCluster="999",
  skipAssessmentPlots=TRUE,
  # Clustering settings
  minShape="2020-04",
  maxShape="2020-09",
  ratioTotalvsShape=0.25,
  ratioDeathvsCase=0.001,
  minCase=5000,
  minDeath=100,
  hmlSegs=3,
  eslSegs=3,
  seed=2010261358
)
##
## *** File has been checked for uniqueness by: state date
# The clusters returned by the full function should exactly match those from the
# standalone getCountyClusters() call made with defaultCluster="999"
identical(
  vecTestCluster_001,
  testFullUSAF_v001$useClusters
)
## [1] TRUE
# Assemble dfPerCapita from components
# readList maps each USA Facts data type to the local CSV file it is read from
readList <- list(
  "usafCase"="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201026.csv",
  "usafDeath"="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201026.csv"
)
# compareList maps each data type to the previously loaded reference data used for the QC comparison
compareList <- list(
  "usafCase"=testDFRefCase,
  "usafDeath"=testDFRefDeath
)
# downloadTo is NA for any file that already exists locally; readRunUSAFacts() reads an NA entry
# as "do not download" and uses the existing file
# ovrwriteLog is spelled exactly as the function body reads it: R argument names are case-sensitive,
# so a differently-cased name would not set the flag and would fall into ... instead
# createClusters=FALSE, so the clustering settings at the end of the call are not used in this run
testFullUSAF_v002 <- readRunUSAFacts(
  maxDate=NA,
  downloadTo=lapply(readList, FUN=function(x) if(file.exists(x)) NA else x),
  readFrom=readList,
  compareFile=compareList,
  writeLog=NULL,
  ovrwriteLog=TRUE,
  createClusters=FALSE,
  skipAssessmentPlots=TRUE,
  defaultCluster="999",
  minPopCluster=25000,
  hierarchical=NA,
  minShape="2020-04",
  maxShape="2020-09",
  ratioDeathvsCase=0.001,
  ratioTotalvsShape=0.25,
  minDeath=100,
  minCase=5000,
  hmlSegs=3,
  eslSegs=3,
  seed=2010261358
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## countyFIPS countyName state name newValue refValue
## 1 35021 Harding County NM new_cases 1 2
## 2 02016 Aleutians West Census Area AK new_cases 5 7
## 3 31171 Thomas County NE new_cases 4 5
## 4 35003 Catron County NM new_cases 9 11
## 5 20183 Smith County KS new_cases 5 6
## 6 48393 Roberts County TX new_cases 9 10
## 7 20161 Riley County KS new_cases 1262 1374
## 8 48417 Shackelford County TX new_cases 23 25
## 9 27069 Kittson County MN new_cases 12 13
## 10 20115 Marion County KS new_cases 86 93
## 11 49039 Sanpete County UT new_cases 172 185
## 12 20117 Marshall County KS new_cases 17 18
## 13 48437 Swisher County TX new_cases 97 102
## 14 50019 Orleans County VT new_cases 21 22
## 15 20053 Ellsworth County KS new_cases 26 27
## 16 29181 Ripley County MO new_cases 132 137
## 17 20055 Finney County KS new_cases 1893 1963
## 18 41037 Lake County OR new_cases 28 29
## 19 21161 Mason County KY new_cases 114 118
## 20 48283 La Salle County TX new_cases 346 358
## 21 29171 Putnam County MO new_cases 29 30
## 22 17009 Brown County IL new_cases 30 31
## 23 49041 Sevier County UT new_cases 94 97
## 24 31135 Perkins County NE new_cases 32 33
## 25 21189 Owsley County KY new_cases 36 37
## 26 49023 Juab County UT new_cases 110 113
## 27 51735 Poquoson city VA new_cases 74 76
## 28 08081 Moffat County CO new_cases 37 38
## 29 53043 Lincoln County WA new_cases 38 39
## 30 23003 Aroostook County ME new_cases 41 42
## 31 42123 Warren County PA new_cases 42 43
## 32 54065 Morgan County WV new_cases 45 46
## 33 27089 Marshall County MN new_cases 48 49
## 34 21127 Lawrence County KY new_cases 51 52
## 35 42117 Tioga County PA new_cases 52 53
## 36 15007 Kauai County HI new_cases 57 58
## 37 21231 Wayne County KY new_cases 122 124
## 38 29033 Carroll County MO new_cases 126 128
## 39 37055 Dare County NC new_cases 255 259
## 40 51610 Falls Church city VA new_cases 66 67
## 41 17059 Gallatin County IL new_cases 67 68
## 42 26011 Arenac County MI new_cases 67 68
## 43 48009 Archer County TX new_cases 67 68
## 44 48275 Knox County TX new_cases 67 68
## 45 48259 Kendall County TX new_cases 202 205
## 46 24023 Garrett County MD new_cases 70 71
## 47 24029 Kent County MD new_cases 289 293
## 48 48025 Bee County TX new_cases 1791 1815
## 49 18101 Martin County IN new_cases 151 153
## 50 05039 Dallas County AR new_cases 152 154
## 51 47033 Crockett County TN new_cases 481 487
## 52 23025 Somerset County ME new_cases 84 85
## 53 48001 Anderson County TX new_cases 2744 2776
## 54 18007 Benton County IN new_cases 86 87
## 55 41001 Baker County OR new_cases 89 90
## 56 24510 Baltimore City MD new_cases 15085 15236
## 57 21053 Clinton County KY new_cases 100 101
## 58 48057 Calhoun County TX new_cases 617 623
## 59 30053 Lincoln County MT new_cases 104 105
## 60 46127 Union County SD new_cases 329 332
## 61 48019 Bandera County TX new_cases 112 113
## 62 51073 Gloucester County VA new_cases 225 227
## 63 27021 Cass County MN new_cases 117 118
## 64 32013 Humboldt County NV new_cases 117 118
## 65 22043 Grant Parish LA new_cases 470 474
## 66 20077 Harper County KS new_cases 119 120
## 67 50021 Rutland County VT new_cases 128 129
## 68 55023 Crawford County WI new_cases 135 136
## 69 47065 Hamilton County TN new_cases 8859 8924
## 70 22059 La Salle Parish LA new_cases 412 415
## 71 21229 Washington County KY new_cases 139 140
## 72 54063 Monroe County WV new_cases 143 144
## 73 13071 Colquitt County GA new_cases 1871 1883
## 74 21031 Butler County KY new_cases 329 331
## 75 20191 Sumner County KS new_cases 166 167
## 76 21141 Logan County KY new_cases 528 531
## 77 48313 Madison County TX new_cases 713 717
## 78 27039 Dodge County MN new_cases 181 182
## 79 48481 Wharton County TX new_cases 1278 1285
## 80 29211 Sullivan County MO new_cases 184 185
## 81 22091 St. Helena Parish LA new_cases 394 396
## 82 18123 Perry County IN new_cases 210 211
## 83 48265 Kerr County TX new_cases 442 444
## 84 20085 Jackson County KS new_cases 225 226
## 85 05053 Grant County AR new_cases 226 227
## 86 29021 Buchanan County MO new_cases 1591 1598
## 87 41041 Lincoln County OR new_cases 470 472
## 88 47181 Wayne County TN new_cases 1471 1477
## 89 08067 La Plata County CO new_cases 247 248
## 90 37149 Polk County NC new_cases 253 254
## 91 21045 Casey County KY new_cases 264 265
## 92 22021 Caldwell Parish LA new_cases 297 298
## 93 18151 Steuben County IN new_cases 343 344
## 94 30029 Flathead County MT new_cases 734 736
## 95 51670 Hopewell city VA new_cases 372 373
## 96 48177 Gonzales County TX new_cases 822 824
## 97 42119 Union County PA new_cases 438 439
## 98 22013 Bienville Parish LA new_cases 444 445
## 99 34041 Warren County NJ new_cases 1422 1425
## 100 48187 Guadalupe County TX new_cases 1948 1952
## 101 22113 Vermilion Parish LA new_cases 1976 1980
## 102 18133 Putnam County IN new_cases 513 514
## 103 13169 Jones County GA new_cases 514 515
## 104 01131 Wilcox County AL new_cases 529 530
## 105 01105 Perry County AL new_cases 535 536
## 106 48253 Jones County TX new_cases 565 566
## 107 22127 Winn Parish LA new_cases 597 598
## 108 48361 Orange County TX new_cases 1844 1847
## 109 49037 San Juan County UT new_cases 683 684
## 110 01091 Marengo County AL new_cases 693 694
## 111 48285 Lavaca County TX new_cases 705 706
## 112 48255 Karnes County TX new_cases 752 753
## 113 51810 Virginia Beach City VA new_cases 6515 6523
## 114 34009 Cape May County NJ new_cases 981 982
## 115 35021 Harding County NM cases 146 177
## 116 02016 Aleutians West Census Area AK cases 250 279
## 117 31171 Thomas County NE cases 205 217
## 118 35003 Catron County NM cases 525 547
## 119 48283 La Salle County TX cases 21806 22382
## 120 41037 Lake County OR cases 2243 2302
## 121 20161 Riley County KS cases 51360 51968
## 122 49039 Sanpete County UT cases 10292 10391
## 123 21189 Owsley County KY cases 1408 1419
## 124 29171 Putnam County MO cases 1058 1063
## 125 20183 Smith County KS cases 424 426
## 126 21161 Mason County KY cases 5828 5848
## 127 20115 Marion County KS cases 4456 4470
## 128 50019 Orleans County VT cases 2109 2115
## 129 49041 Sevier County UT cases 7068 7086
## 130 20117 Marshall County KS cases 792 794
## 131 27069 Kittson County MN cases 408 409
## 132 48437 Swisher County TX cases 7060 7077
## 133 20055 Finney County KS cases 216557 217002
## 134 49023 Juab County UT cases 6162 6174
## 135 21127 Lawrence County KY cases 2649 2654
## 136 15007 Kauai County HI cases 5802 5812
## 137 48393 Roberts County TX cases 655 656
## 138 20077 Harper County KS cases 3299 3304
## 139 48417 Shackelford County TX cases 1325 1327
## 140 20053 Ellsworth County KS cases 1567 1569
## 141 41001 Baker County OR cases 3290 3294
## 142 17059 Gallatin County IL cases 3332 3336
## 143 05039 Dallas County AR cases 5391 5397
## 144 27089 Marshall County MN cases 2781 2784
## 145 48025 Bee County TX cases 72547 72624
## absDelta pctDelta
## 1 1 0.666666667
## 2 2 0.333333333
## 3 1 0.222222222
## 4 2 0.200000000
## 5 1 0.181818182
## 6 1 0.105263158
## 7 112 0.084977238
## 8 2 0.083333333
## 9 1 0.080000000
## 10 7 0.078212291
## 11 13 0.072829132
## 12 1 0.057142857
## 13 5 0.050251256
## 14 1 0.046511628
## 15 1 0.037735849
## 16 5 0.037174721
## 17 70 0.036307054
## 18 1 0.035087719
## 19 4 0.034482759
## 20 12 0.034090909
## 21 1 0.033898305
## 22 1 0.032786885
## 23 3 0.031413613
## 24 1 0.030769231
## 25 1 0.027397260
## 26 3 0.026905830
## 27 2 0.026666667
## 28 1 0.026666667
## 29 1 0.025974026
## 30 1 0.024096386
## 31 1 0.023529412
## 32 1 0.021978022
## 33 1 0.020618557
## 34 1 0.019417476
## 35 1 0.019047619
## 36 1 0.017391304
## 37 2 0.016260163
## 38 2 0.015748031
## 39 4 0.015564202
## 40 1 0.015037594
## 41 1 0.014814815
## 42 1 0.014814815
## 43 1 0.014814815
## 44 1 0.014814815
## 45 3 0.014742015
## 46 1 0.014184397
## 47 4 0.013745704
## 48 24 0.013311148
## 49 2 0.013157895
## 50 2 0.013071895
## 51 6 0.012396694
## 52 1 0.011834320
## 53 32 0.011594203
## 54 1 0.011560694
## 55 1 0.011173184
## 56 151 0.009960094
## 57 1 0.009950249
## 58 6 0.009677419
## 59 1 0.009569378
## 60 3 0.009077156
## 61 1 0.008888889
## 62 2 0.008849558
## 63 1 0.008510638
## 64 1 0.008510638
## 65 4 0.008474576
## 66 1 0.008368201
## 67 1 0.007782101
## 68 1 0.007380074
## 69 65 0.007310353
## 70 3 0.007255139
## 71 1 0.007168459
## 72 1 0.006968641
## 73 12 0.006393181
## 74 2 0.006060606
## 75 1 0.006006006
## 76 3 0.005665722
## 77 4 0.005594406
## 78 1 0.005509642
## 79 7 0.005462349
## 80 1 0.005420054
## 81 2 0.005063291
## 82 1 0.004750594
## 83 2 0.004514673
## 84 1 0.004434590
## 85 1 0.004415011
## 86 7 0.004390091
## 87 2 0.004246285
## 88 6 0.004070556
## 89 1 0.004040404
## 90 1 0.003944773
## 91 1 0.003780718
## 92 1 0.003361345
## 93 1 0.002911208
## 94 2 0.002721088
## 95 1 0.002684564
## 96 2 0.002430134
## 97 1 0.002280502
## 98 1 0.002249719
## 99 3 0.002107482
## 100 4 0.002051282
## 101 4 0.002022245
## 102 1 0.001947420
## 103 1 0.001943635
## 104 1 0.001888574
## 105 1 0.001867414
## 106 1 0.001768347
## 107 1 0.001673640
## 108 3 0.001625576
## 109 1 0.001463058
## 110 1 0.001441961
## 111 1 0.001417434
## 112 1 0.001328904
## 113 8 0.001227182
## 114 1 0.001018849
## 115 31 0.191950464
## 116 29 0.109640832
## 117 12 0.056872038
## 118 22 0.041044776
## 119 576 0.026070426
## 120 59 0.025962596
## 121 608 0.011768349
## 122 99 0.009573079
## 123 11 0.007782101
## 124 5 0.004714757
## 125 2 0.004705882
## 126 20 0.003425831
## 127 14 0.003136903
## 128 6 0.002840909
## 129 18 0.002543451
## 130 2 0.002522068
## 131 1 0.002447980
## 132 17 0.002405036
## 133 445 0.002052777
## 134 12 0.001945525
## 135 5 0.001885725
## 136 10 0.001722060
## 137 1 0.001525553
## 138 5 0.001514463
## 139 2 0.001508296
## 140 2 0.001275510
## 141 4 0.001215067
## 142 4 0.001199760
## 143 6 0.001112347
## 144 3 0.001078167
## 145 77 0.001060818
##
##
##
## Raw file for usafCase:
## Rows: 885,015
## Columns: 7
## $ countyFIPS <chr> "00000", "00000", "00000", "00000", "00000", "00000", "0000~
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "Statewid~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ stateFIPS <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "01",~
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26~
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## [1] date name newValue refValue absDelta pctDelta
## <0 rows> (or 0-length row.names)
##
##
## ***Differences of at least 0 and at least 0.1%
##
## countyFIPS countyName state name newValue refValue absDelta
## 1 02050 Bethel Census Area AK new_deaths 0 1 1
## 2 55078 Menominee County WI new_deaths 0 1 1
## 3 05109 Pike County AR new_deaths 1 2 1
## 4 18121 Parke County IN new_deaths 1 2 1
## 5 47153 Sequatchie County TN new_deaths 1 2 1
## 6 01031 Coffee County AL new_deaths 5 7 2
## 7 13159 Jasper County GA new_deaths 3 4 1
## 8 20015 Butler County KS new_deaths 3 4 1
## 9 47145 Roane County TN new_deaths 3 4 1
## 10 48395 Robertson County TX new_deaths 3 4 1
## 11 36121 Wyoming County NY new_deaths 4 5 1
## 12 01029 Cleburne County AL new_deaths 5 6 1
## 13 47177 Warren County TN new_deaths 7 8 1
## 14 51685 Manassas Park city VA new_deaths 7 8 1
## 15 47139 Polk County TN new_deaths 10 11 1
## 16 54099 Wayne County WV new_deaths 10 11 1
## 17 45001 Abbeville County SC new_deaths 11 12 1
## 18 47045 Dyer County TN new_deaths 12 13 1
## 19 13027 Brooks County GA new_deaths 25 27 2
## 20 29145 Newton County MO new_deaths 13 14 1
## 21 41071 Yamhill County OR new_deaths 13 14 1
## 22 18123 Perry County IN new_deaths 14 15 1
## 23 06005 Amador County CA new_deaths 15 16 1
## 24 45011 Barnwell County SC new_deaths 16 17 1
## 25 01087 Macon County AL new_deaths 18 19 1
## 26 12051 Hendry County FL new_deaths 40 42 2
## 27 01041 Crenshaw County AL new_deaths 30 31 1
## 28 26087 Lapeer County MI new_deaths 36 37 1
## 29 25003 Berkshire County MA new_deaths 48 47 1
## 30 04003 Cochise County AZ new_deaths 68 69 1
## 31 04025 Yavapai County AZ new_deaths 79 80 1
## 32 45035 Dorchester County SC new_deaths 87 88 1
## 33 34033 Salem County NJ new_deaths 88 89 1
## 34 26081 Kent County MI new_deaths 172 173 1
## 35 34037 Sussex County NJ new_deaths 197 198 1
## 36 26049 Genesee County MI new_deaths 306 307 1
## 37 48453 Travis County TX new_deaths 395 396 1
## 38 36061 New York County NY new_deaths 3173 3177 4
## 39 34027 Morris County NJ new_deaths 830 831 1
## 40 02050 Bethel Census Area AK deaths 0 25 25
## 41 55078 Menominee County WI deaths 0 1 1
## 42 18121 Parke County IN deaths 48 74 26
## 43 47153 Sequatchie County TN deaths 36 49 13
## 44 36121 Wyoming County NY deaths 641 788 147
## 45 48395 Robertson County TX deaths 126 146 20
## 46 01031 Coffee County AL deaths 400 461 61
## 47 20015 Butler County KS deaths 80 88 8
## 48 13159 Jasper County GA deaths 175 190 15
## 49 05109 Pike County AR deaths 51 54 3
## 50 47145 Roane County TN deaths 103 107 4
## 51 47139 Polk County TN deaths 356 365 9
## 52 41071 Yamhill County OR deaths 1478 1503 25
## 53 01029 Cleburne County AL deaths 242 246 4
## 54 18123 Perry County IN deaths 1010 1025 15
## 55 45001 Abbeville County SC deaths 467 473 6
## 56 01087 Macon County AL deaths 1333 1350 17
## 57 47177 Warren County TN deaths 319 323 4
## 58 06005 Amador County CA deaths 343 347 4
## 59 51685 Manassas Park city VA deaths 789 795 6
## 60 45011 Barnwell County SC deaths 571 574 3
## 61 01041 Crenshaw County AL deaths 1034 1039 5
## 62 29145 Newton County MO deaths 620 622 2
## 63 13027 Brooks County GA deaths 1912 1917 5
## 64 12051 Hendry County FL deaths 3629 3637 8
## 65 47045 Dyer County TN deaths 527 528 1
## 66 34033 Salem County NJ deaths 9058 9075 17
## 67 54099 Wayne County WV deaths 1317 1319 2
## 68 26087 Lapeer County MI deaths 4767 4774 7
## pctDelta
## 1 2.000000000
## 2 2.000000000
## 3 0.666666667
## 4 0.666666667
## 5 0.666666667
## 6 0.333333333
## 7 0.285714286
## 8 0.285714286
## 9 0.285714286
## 10 0.285714286
## 11 0.222222222
## 12 0.181818182
## 13 0.133333333
## 14 0.133333333
## 15 0.095238095
## 16 0.095238095
## 17 0.086956522
## 18 0.080000000
## 19 0.076923077
## 20 0.074074074
## 21 0.074074074
## 22 0.068965517
## 23 0.064516129
## 24 0.060606061
## 25 0.054054054
## 26 0.048780488
## 27 0.032786885
## 28 0.027397260
## 29 0.021052632
## 30 0.014598540
## 31 0.012578616
## 32 0.011428571
## 33 0.011299435
## 34 0.005797101
## 35 0.005063291
## 36 0.003262643
## 37 0.002528445
## 38 0.001259843
## 39 0.001204094
## 40 2.000000000
## 41 2.000000000
## 42 0.426229508
## 43 0.305882353
## 44 0.205738279
## 45 0.147058824
## 46 0.141695703
## 47 0.095238095
## 48 0.082191781
## 49 0.057142857
## 50 0.038095238
## 51 0.024965326
## 52 0.016772895
## 53 0.016393443
## 54 0.014742015
## 55 0.012765957
## 56 0.012672382
## 57 0.012461059
## 58 0.011594203
## 59 0.007575758
## 60 0.005240175
## 61 0.004823927
## 62 0.003220612
## 63 0.002611648
## 64 0.002202037
## 65 0.001895735
## 66 0.001875034
## 67 0.001517451
## 68 0.001467351
##
##
##
## Raw file for usafDeath:
## Rows: 885,015
## Columns: 7
## $ countyFIPS <chr> "00000", "00000", "00000", "00000", "00000", "00000", "0000~
## $ countyName <chr> "Statewide Unallocated", "Statewide Unallocated", "Statewid~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ stateFIPS <chr> "01", "01", "01", "01", "01", "01", "01", "01", "01", "01",~
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 7.90e+8 8478790 885015
## 2 after 7.85e+8 8437719 870888
## 3 pctchg 6.42e-3 0.00484 0.0160
##
##
## Processed for usafCase:
## Rows: 870,888
## Columns: 5
## $ countyFIPS <chr> "01001", "01001", "01001", "01001", "01001", "01001", "0100~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26~
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 2.76e+7 222573 885015
## 2 after 2.74e+7 221995 870888
## 3 pctchg 6.43e-3 0.00260 0.0160
##
##
## Processed for usafDeath:
## Rows: 870,888
## Columns: 5
## $ countyFIPS <chr> "01001", "01001", "01001", "01001", "01001", "01001", "0100~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
##
## Integrated per capita data file:
## Rows: 870,334
## Columns: 15
## $ countyFIPS <chr> "01001", "01003", "01005", "01007", "01009", "01011", "0101~
## $ state <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",~
## $ date <date> 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22, 2020-01-22~
## $ cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_cases <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ new_deaths <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ tcpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ cpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ tdpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ dpm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ tcpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ cpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ tdpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
## $ dpm7 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
# Get previous per capita data for comparison (checked for uniqueness by county-date)
compareDataTest <- checkUniqueRows(readFromRDS("cty_20201026")$clusterStateData, 
                                   uniqueBy=c("fipsCounty", "date")
                                   )
##
## *** File has been checked for uniqueness by: fipsCounty date
# Rows of the old data that have no county-date match in the new per capita data
# Only the counties with spelling mismatches should be in old but not in new
anti_join(compareDataTest, 
          testFullUSAF_v002$dfPerCapita, 
          by=c("fipsCounty"="countyFIPS", "date")
          )
## # A tibble: 15 x 12
## date cpm dpm cpm7 dpm7 cases deaths fipsCounty cluster state
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <fct> <chr>
## 1 NA NA NA NA NA NA NA 08014 999 CO
## 2 NA NA NA NA NA NA NA 27073 999 MN
## 3 NA NA NA NA NA NA NA 51115 999 VA
## 4 NA NA NA NA NA NA NA 51510 999 VA
## 5 NA NA NA NA NA NA NA 51540 999 VA
## 6 NA NA NA NA NA NA NA 51550 999 VA
## 7 NA NA NA NA NA NA NA 51590 999 VA
## 8 NA NA NA NA NA NA NA 51630 999 VA
## 9 NA NA NA NA NA NA NA 51660 999 VA
## 10 NA NA NA NA NA NA NA 51683 999 VA
## 11 NA NA NA NA NA NA NA 51710 999 VA
## 12 NA NA NA NA NA NA NA 51740 999 VA
## 13 NA NA NA NA NA NA NA 51760 999 VA
## 14 NA NA NA NA NA NA NA 51800 999 VA
## 15 NA NA NA NA NA NA NA 51810 999 VA
## # ... with 2 more variables: countyName <chr>, pop <dbl>
# Rows of the new per capita data with no county-date match in the old data, counted by county
# Primarily the counties with spelling mismatches should be in new but not in old
# Should probably delete the 02270 and 06000 which both have population 0
count(anti_join(testFullUSAF_v002$dfPerCapita, 
                compareDataTest, 
                by=c("countyFIPS"="fipsCounty", "date")
                ), 
      countyFIPS, 
      state
      )
## # A tibble: 15 x 3
## countyFIPS state n
## <chr> <chr> <int>
## 1 08014 CO 277
## 2 27073 MN 277
## 3 51115 VA 277
## 4 51510 VA 277
## 5 51540 VA 277
## 6 51550 VA 277
## 7 51590 VA 277
## 8 51630 VA 277
## 9 51660 VA 277
## 10 51683 VA 277
## 11 51710 VA 277
## 12 51740 VA 277
## 13 51760 VA 277
## 14 51800 VA 277
## 15 51810 VA 277
# Compare values across files
# Columns are named <metric>_<src> so that pivot_longer() can split them into metric and source
# Only rows where exactly one source is NA, or where the sources differ by more than 1e-6, are kept
tmpCheck <- testFullUSAF_v002$dfPerCapita %>%
    select(countyFIPS, 
           date, 
           state, 
           cases_usaf=new_cases, 
           deaths_usaf=new_deaths, 
           cpm7_usaf=cpm7, 
           dpm7_usaf=dpm7
           ) %>%
    inner_join(select(compareDataTest, 
                      fipsCounty, 
                      date, 
                      state, 
                      cases_old=cases, 
                      deaths_old=deaths, 
                      cpm7_old=cpm7, 
                      dpm7_old=dpm7
                      ), 
               by=c("countyFIPS"="fipsCounty", "date", "state")
               ) %>%
    pivot_longer(-c(countyFIPS, date, state), names_to=c("metric", "src"), names_sep="_") %>%
    pivot_wider(names_from=src, values_from=value) %>%
    filter(xor(is.na(usaf), is.na(old)) | abs(usaf-old) > 0.000001)

# Number of mismatched rows by metric (zero rows means the files agree)
count(tmpCheck, metric)
## # A tibble: 0 x 2
## # ... with 2 variables: metric <chr>, n <int>
The data produced are broadly the same, with two exceptions: the misnamed counties (handled better in the more recent data) and the elimination of zero-population FIPS codes (handled better in the previous process). The latter issue has been addressed by excluding zero-population counties from the universe in step 1 (getCountyData()).
A cluster assessment capability is also added:
# Create the integrated data: county population, per capita metrics, and cluster assignments
# Each element of lstExtract is either a function applied to the matching element of lst, or NULL
dfFullUSAF <- integrateData(
    lst=list("stateData"=getCountyData(selfList=list("countyFIPS"=zeroPad5), 
                                       lstExclude=list("pop"=c(0))
                                       ), 
             "dfPerCapita"=testFullUSAF_v002$dfPerCapita, 
             "useClusters"=colMutater(clustersToFrame(testFullUSAF_v001$useClusters, 
                                                      colNameName="countyFIPS"
                                                      ), 
                                      selfList=list("countyFIPS"=zeroPad5)
                                      )
             ), 
    lstExtract=list("stateData"=function(d) colSelector(d, vecSelect=c("countyFIPS", "pop")), 
                    "dfPerCapita"=NULL, 
                    "useClusters"=NULL
                    ), 
    keyJoin="countyFIPS"
)

# Create the aggregate data from the integrated data
# agg1 is applied to pop with no weight variable
# agg2 is a pop-weighted mean of the burden metrics, with outputs prefixed "wm_"
dfAggUSAF <- combineAggData(
    dfFullUSAF, 
    aggBy=list("agg1"=list(aggFunc=specNA(specSumProd), 
                           aggVars=c("pop"), 
                           wtVar=NULL, 
                           prefix=NULL
                           ), 
               "agg2"=list(aggFunc=specNA(weighted.mean), 
                           aggVars=c("tcpm7", "tdpm7", "cpm7", "dpm7"), 
                           wtVar="pop", 
                           prefix="wm_"
                           )
               )
)
# Helper function to make a summary map (or, when countOnly=TRUE, a bar plot of member counts)
helperSummaryMap <- function(df, 
                             mapLevel="states", 
                             keyCol="state",
                             values="cluster",
                             discreteValues=NULL,
                             legend.position="right",
                             labelScale=TRUE,
                             extraArgs=list(),
                             countOnly=FALSE,
                             textLabel=c(),
                             ...
                             ) {
    
    # FUNCTION ARGUMENTS:
    # df: a data frame containing a level of geography and an associated cluster
    # mapLevel: a parameter for whether the map is "states" or "counties"
    # keyCol: the key column for plotting (usmap::plot_usmap is particular, and this must be 'state' or 'fips')
    #         a keyCol of 'countyFIPS' is renamed to 'fips' before mapping
    # values: the character name of the field containing the data to be plotted
    # discreteValues: boolean for whether the values are discrete (if not, use continuous)
    #                 NULL means infer from data
    # legend.position: character for the location of the legend in the plot
    # labelScale: boolean, should a scale_fill_ be created?  Use FALSE if contained in extraArgs
    # extraArgs: list of other arguments that will be appended as '+' to the end of the usmap::plot_usmap call
    # countOnly: should a bar plot of counts only be produced?
    # textLabel: a list of elements that should be labelled as text on the plot (too small to see)
    #            elements that are not present in df are ignored
    # ...: other parameters to be passed to usmap::plot_usmap (e.g., labels, include, exclude, etc.)
    #
    # RETURNS: the plot object (a map, or a bar plot if countOnly is TRUE)
    
    # Modify the data frame to contain only the relevant data
    df <- df %>%
        select(all_of(c(keyCol, values))) %>%
        distinct()
    
    # Determine the type of data being plotted
    if (is.null(discreteValues)) discreteValues <- !is.numeric(df[[values]])
    
    # Convert data type if needed (scalar condition, so use the short-circuit operator)
    if (isTRUE(discreteValues) && is.numeric(df[[values]])) 
        df[[values]] <- factor(df[[values]])
    
    # If count only is needed, create a count map; otherwise create a map
    if (isTRUE(countOnly)) { 
        gg <- df %>%
            ggplot(aes(x=fct_rev(get(values)))) + 
            geom_bar(aes_string(fill=values)) + 
            stat_count(aes(label=..count.., y=..count../2), 
                       geom="text", 
                       position="identity", 
                       fontface="bold"
                       ) +
            coord_flip() + 
            labs(y="Number of members", x="")
    } else {
        # usmap::plot_usmap needs the county key to be named 'fips'
        if(keyCol=="countyFIPS") {
            df <- df %>% colRenamer(vecRename=c("countyFIPS"="fips"))
            keyCol <- "fips"
        }
        gg <- usmap::plot_usmap(regions=mapLevel, data=df, values=values, ...)
        if (length(textLabel) > 0) {
            # Keep only the requested labels that exist in the data, ordered as in textLabel
            labDF <- df %>% 
                filter(get(keyCol) %in% textLabel) %>%
                mutate(rk=match(get(keyCol), textLabel)) %>%
                arrange(rk)
            # Skip labelling entirely if none of the requested labels are in the data
            if (nrow(labDF) > 0) {
                # Offsets are based on the rows actually present rather than length(textLabel)
                # This avoids a size mismatch in mutate() when a requested label is missing from df
                # Each successive label is shifted by 0.8 in lon and 1.5 in lat from (-70.1, 40.1)
                labDF <- labDF %>%
                    mutate(lon=-70.1-0.8*(row_number()-1), 
                           lat=40.1-1.5*(row_number()-1)
                           ) %>%
                    select(lon, lat, everything()) %>%
                    usmap::usmap_transform()
                # NOTE(review): lon.1 and lat.1 are the transformed column names assumed here
                # Confirm against the installed usmap version
                gg <- gg + geom_text(data=labDF, 
                                     aes(x=lon.1, y=lat.1, label=paste(get(keyCol), get(values))), 
                                     size=3.25
                                     )
            }
        }
    }
    
    # Position the legend as requested
    gg <- gg + theme(legend.position=legend.position)
    
    # Create the scale if appropriate (the legend title is the name of the values column)
    if (isTRUE(labelScale)) gg <- gg + 
        if(isTRUE(discreteValues)) scale_fill_discrete(values) else scale_fill_continuous(values)
    
    # Apply extra arguments
    for (ctr in seq_along(extraArgs)) gg <- gg + extraArgs[[ctr]]
    
    # Return the map object
    gg
    
}
# Updated function for handling county-level clusters
createSummary <- function(df, 
                          stateClusterDF=NULL,
                          brewPalette=NA, 
                          dataType="state"
                          ) {
    
    # FUNCTION ARGUMENTS:
    # df: an integrated data frame by cluster-date
    # stateClusterDF: a data frame containing state-cluster (NULL means it can be found in df)
    # brewPalette: character string for a palette from RColorBrewer to be used (NA means default colors)
    # dataType: the type of maps being produced ("state" or "county")
    #           any value other than "state" is treated as county-level
    #
    # RETURNS: a list of four plots that can be relevant for a dashboard
    #   p1: map of segments
    #   p2: bar plot of counts by segment
    #   p3: facetted bar plot of segment descriptors (e.g., population, burden per million)
    #   p4: facetted trend-line plot of burden by segments
    
    # Settings shared by several of the plots
    isState <- (dataType=="state")
    useDefaultColors <- is.na(brewPalette)
    geoLevel <- if (isState) "states" else "counties"
    geoKey <- if (isState) "state" else "countyFIPS"
    clusterDF <- if (is.null(stateClusterDF)) df else stateClusterDF
    
    # Builds a fresh list of fill-scale arguments each time it is called (empty for default colors)
    makeFillArgs <- function() {
        if (useDefaultColors) return(list())
        list("arg1"=scale_fill_brewer("Cluster", palette=brewPalette))
    }
    
    # p1: map of the clusters, with text labels for a fixed set of states on state maps
    mapPlot <- helperSummaryMap(clusterDF, 
                                mapLevel=geoLevel,
                                keyCol=geoKey,
                                discreteValues=TRUE, 
                                labelScale=useDefaultColors, 
                                textLabel=if (isState) c("RI", "CT", "DE", "MD", "DC") else c(),
                                extraArgs=makeFillArgs()
                                )
    
    # p2: bar plot of counts by segment
    countPlot <- helperSummaryMap(clusterDF, 
                                  mapLevel=geoLevel,
                                  keyCol=geoKey,
                                  discreteValues=TRUE, 
                                  labelScale=useDefaultColors, 
                                  countOnly=TRUE, 
                                  extraArgs=makeFillArgs()
                                  )
    
    # p3: population and burden by cluster
    totalPlot <- helperAggTotal(df, 
                                aggVars=c("pop", "wm_tcpm7", "wm_tdpm7"), 
                                mapper=c("pop"="Population (millions)", 
                                         "wm_tcpm7"="Cases per thousand", 
                                         "wm_tdpm7"="Deaths per million"
                                         ), 
                                xLab=NULL, 
                                yLab=NULL, 
                                title=NULL,
                                divideBy=c("pop"=1000000, "wm_tcpm7"=1000), 
                                extraArgs=makeFillArgs()
                                )
    
    # p4: cumulative burden over time (the brewer color scale is added only if a palette was passed)
    trendArgs <- c(list(arg1=scale_x_date(date_breaks="2 months", date_labels="%b-%y"), 
                        arg2=theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
                        ), 
                   if (!useDefaultColors) list(arg3=scale_color_brewer("Cluster", palette=brewPalette))
                   )
    trendPlot <- helperAggTrend(df, 
                                aggVars=c("wm_tcpm7", "wm_tdpm7", if (isState) "wm_hpm7"), 
                                mapper=c("wm_tcpm7"="Cases per thousand\n(cumulative)", 
                                         "wm_tdpm7"="Deaths per million\n(cumulative)", 
                                         "wm_hpm7"="Hospitalized per million\n(current)"
                                         ),
                                yLab=NULL,
                                title=NULL, 
                                divideBy=c("wm_tcpm7"=1000), 
                                linesize=0.75,
                                extraArgs=trendArgs
                                )
    
    list(p1=mapPlot, p2=countPlot, p3=totalPlot, p4=trendPlot)
    
}
# Create the main summary plots (county-level clusters, with countyFIPS padded by zeroPad5)
summaryPlotsUSAF <- createSummary(dfAggUSAF, 
                                  stateClusterDF=colMutater(clustersToFrame(testFullUSAF_v001$useClusters, 
                                                                            colNameName="countyFIPS"
                                                                            ), 
                                                            selfList=list("countyFIPS"=zeroPad5)
                                                            ),
                                  brewPalette="Paired", 
                                  dataType="county"
                                  )

# Create the detailed summaries
detPlotsUSAF <- createDetailedSummaries(dfDetail=dfFullUSAF, 
                                        dfAgg=dfAggUSAF, 
                                        detVar=c("countyFIPS"),
                                        p2DetMetrics=c("tcpm7", "tdpm7", "cpm7", "dpm7"),
                                        brewPalette="Paired"
                                        )

# Print the summary plots: map and cluster totals on the top row, trends across the bottom row
gridExtra::grid.arrange(summaryPlotsUSAF$p1 + theme(legend.position="none"), 
                        summaryPlotsUSAF$p3 + theme(legend.position="left"), 
                        summaryPlotsUSAF$p4, 
                        layout_matrix=rbind(c(1, 2), 
                                            c(3, 3)
                                            )
                        )

# Print the detailed plots
purrr::walk(detPlotsUSAF, .f=print)
## NULL
The plots appear to be created as expected. Next steps are to update the relevant functions so that plotting is an automatic component of data acquisition and clustering.
An updated version of diagnoseClusters() is created:
# Function to create diagnoses and plots for clustering data
diagnoseClusters <- function(lst, 
                             lstExtract=fullListExtract,
                             clusterFrame=NULL,
                             brewPalette=NA, 
                             clusterType="state",
                             printSummary=TRUE, 
                             printDetailed=TRUE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # lst: a list containing processed clustering data
    # lstExtract: the elements to extract from lst with an optional function for converting the elements
    #             NULL means use the extracted element as-is
    #             the default, fullListExtract, is an object defined outside this function
    # clusterFrame: tibble of the clusters to be plotted
    #               NULL means create from lst
    # brewPalette: the color palette to use with scale_*_brewer()
    #              default NA means use the standard color/fill profile
    # clusterType: character variable of form "state" for state clusters and "county" for county
    # printSummary: boolean, should summary plots be printed to the log?
    # printDetailed: boolean, should detailed plots be printed to the log?
    #
    # RETURNS: a list with dfFull, dfAgg, plotClusters, summaryPlots, and detPlots
    
    # Get the key variable (used for joins and the like)
    if (clusterType=="state") {
        keyVar <- "state"
    } else if (clusterType=="county") {
        keyVar <- "countyFIPS"
    } else {
        stop(paste0("\nThe passed clusterType: ", clusterType, " is not programmed\n"))
    }
    
    # Aggregation specification for this cluster type (plotCombineAggByMapper is defined outside this function)
    aggSpec <- plotCombineAggByMapper[[clusterType]]
    
    # Create clusterFrame from lst if it has been passed as NULL
    if (is.null(clusterFrame)) {
        clusterFrame <- clustersToFrame(lst, colNameName=keyVar)
    }
    
    # Create the integrated and aggregate data from lst
    dfFull <- integrateData(lst, lstExtract=lstExtract, otherDF=list(clusterFrame), keyJoin=keyVar)
    dfAgg <- combineAggData(dfFull, aggBy=aggSpec)
    
    # Create the main summary plots
    summaryPlots <- createSummary(dfAgg, 
                                  stateClusterDF=clusterFrame, 
                                  brewPalette=brewPalette, 
                                  dataType=clusterType
                                  )
    
    # Create the detailed summaries
    detPlots <- createDetailedSummaries(dfDetail=dfFull, 
                                        dfAgg=dfAgg, 
                                        detVar=keyVar,
                                        p2DetMetrics=aggSpec$agg2$aggVars,
                                        brewPalette=brewPalette
                                        )
    
    # Print the summary plots if requested: p1 and p3 on the top row, p4 across the bottom row
    if (isTRUE(printSummary)) {
        plotLayout <- rbind(c(1, 2), 
                            c(3, 3)
                            )
        gridExtra::grid.arrange(summaryPlots$p1 + theme(legend.position="none"), 
                                summaryPlots$p3 + theme(legend.position="left"), 
                                summaryPlots$p4, 
                                layout_matrix=plotLayout
                                )
    }
    
    # Print the detailed plots if requested
    if (isTRUE(printDetailed)) {
        purrr::walk(detPlots, .f=print)
    }
    
    # Return a list of the key plotting files
    list(dfFull=dfFull, 
         dfAgg=dfAgg, 
         plotClusters=clusterFrame, 
         summaryPlots=summaryPlots, 
         detPlots=detPlots
         )
    
}
The diagnosis process is tested on the existing list:
# Test diagnoseClusters() on county data, using existing clusters with countyFIPS padded by zeroPad5
testDiagnose <- diagnoseClusters(
    lst=list("stateData"=testFullUSAF_v002$countyData, 
             "dfPerCapita"=testFullUSAF_v002$dfPerCapita
             ), 
    lstExtract=list("stateData"=function(d) colSelector(d, vecSelect=c("countyFIPS", "pop")), 
                    "dfPerCapita"=NULL
                    ), 
    clusterFrame=colMutater(clustersToFrame(testFullUSAF_v001$useClusters, 
                                            colNameName="countyFIPS"
                                            ), 
                            selfList=list("countyFIPS"=zeroPad5)
                            ),
    brewPalette="Paired", 
    clusterType="county"
)
## NULL
The process appears to be working for county-level data. Next steps are to include the diagnoseClusters() function in readRunUSAFacts() so that the full process can be run in an integrated manner:
# Function to run the USA Facts (US county-level coronavirus data) clustering process
readRunUSAFacts <- function(maxDate, 
                            downloadTo=list("usafCase"=NA, "usafDeath"=NA),
                            readFrom=downloadTo, 
                            compareFile=list("usafCase"=NA, "usafDeath"=NA),
                            writeLog=NULL,
                            ovrwriteLog=TRUE,
                            dfPerCapita=NULL,
                            useClusters=NULL,
                            showBurdenMinPop=10000,
                            minPopCluster=25000,
                            defaultCluster=NULL,
                            hierarchical=FALSE,
                            kCut=6,
                            orderCluster=TRUE,
                            reAssignCounty=list(),
                            skipAssessmentPlots=FALSE,
                            brewPalette=NA,
                            ...
                            ) {
    
    # FUNCTION ARGUMENTS:
    # maxDate: the maximum date to use for data from the cases and deaths file
    #          NOTE(review): within this function maxDate is only passed through to the returned list
    # downloadTo: named list for locations to download data (usafCase, usafDeath, usafPop)
    #             NA (or an element that is missing from the list) means do not download data for that element
    # readFrom: named list for locations to read data from (defaults to downloadTo)
    # compareFile: named list for the reference file to be used for usafCase, usafDeath, usafPop
    #              NA means do not use a reference file for that element
    # writeLog: name of a separate log file for capturing detailed data on changes between files
    #           NULL means no detailed data captured
    # ovrwriteLog: boolean, should the log file be overwritten and started again from scratch?
    # dfPerCapita: file can be passed directly, which bypasses the loading and processing steps
    #              default NULL means create dfPerCapita using steps 2-4
    # useClusters: named vector containing the clusters to use
    #              NULL means create clusters from this data
    # showBurdenMinPop: minimum population for showing in burden by cluster plots (NULL means skip plot)
    # minPopCluster: minimum population for including county in running cluster-level metrics
    # defaultCluster: cluster label to be assigned to any county that falls below minPopCluster
    #                 NULL means do not add these to the clustering vector
    # hierarchical: whether to create hierarchical clusters
    #               TRUE means run hierarchical clustering
    #               FALSE means run kmeans clustering
    #               NA means run rules-based clustering
    # kCut: if hierarchical clustering is used, what k (number of clusters in cutree) should be used?
    # orderCluster: if FALSE, ignore; if TRUE, order by "dpm"; if anything else, order by orderCluster
    # reAssignCounty: mapping file for assigning a county to another county's cluster
    #                 format list("countyToChange"="countyClusterToAssign")
    # skipAssessmentPlots: boolean, should cluster assessment plots be skipped?
    # brewPalette: character vector length-1 referencing a color scheme from brewer_pal to use
    #              NA means use R default color schemes
    # ...: other arguments that will be passed to clusterCounties
    #
    # RETURNS: a list with countyData, dfRaw, dfProcess, dfPerCapita, useClusters, maxDate, and plotDataList
    
    # STEP 1: Get a county-level population file, with fips as 5-digit character and non-zero population
    countyData <- getCountyData(selfList=list("countyFIPS"=zeroPad5), lstExclude=list("pop"=c(0)))
    
    # If a log file is requested, create the log file (allows for append=TRUE for all downstream functions)
    if (!is.null(writeLog)) genNewLog(writeLog=writeLog, ovrwriteLog=ovrwriteLog)
    
    # Get the data types to be used (elements of readFrom) and create a file storage list
    elemUsed <- names(readFrom)
    dfRawList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
    dfProcessList <- vector("list", length=length(elemUsed)) %>% purrr::set_names(elemUsed)
    
    # Steps 2-4 are required only if dfPerCapita has not been passed
    if (is.null(dfPerCapita)) {
        
        # STEP 2: Download and QC each requested data element
        for (elem in elemUsed) {
            # An element of readFrom that has no counterpart in downloadTo is treated like NA (no download)
            # This avoids 'argument is of length zero' from is.na() being applied to a missing list element
            dlTarget <- if (elem %in% names(downloadTo)) downloadTo[[elem]] else NA
            getElemData <- !(is.null(dlTarget) || isTRUE(is.na(dlTarget)))
            dfRawList[[elem]] <- readQCRawUSAF(fileName=readFrom[[elem]], 
                                               writeLog=writeLog, 
                                               ovrwriteLog=FALSE, 
                                               urlType=elem, 
                                               getData=getElemData, 
                                               dfRef=compareFile[[elem]]
                                               )
            glimpseLog(dfRawList[[elem]], txt=paste0("\nRaw file for ", elem, ":\n"), logFile=writeLog)
        }
        
        # STEP 3: Process all requested data (kept as a separate loop so that all raw-file logs come first)
        for (elem in elemUsed) {
            dfProcessList[[elem]] <- processRawFile(dfRawList[[elem]], 
                                                    vecRename=c(), 
                                                    vecSelect=vecSelectMapper[[elem]], 
                                                    lstCombo=lstComboMapper[[elem]], 
                                                    lstFilter=lstFilterMapper[[elem]], 
                                                    lstExclude=lstExcludeMapper[[elem]]
                                                    )
            glimpseLog(dfProcessList[[elem]], txt=paste0("\nProcessed for ", elem, ":\n"), logFile=writeLog)
        }
        
        # STEP 4: Integrate to create a per-capita data file
        dfPerCapita <- createPerCapita(dfProcessList, 
                                       uqBy=c("countyFIPS", "state", "date"), 
                                       popData=countyData,
                                       popJoinBy=c("countyFIPS", "state"),
                                       mapper=perCapMapper
                                       )
        glimpseLog(dfPerCapita, txt="\nIntegrated per capita data file:\n", logFile=writeLog)
        
    } else {
        # No raw or processed files exist when dfPerCapita has been passed directly
        dfRawList <- NULL
        dfProcessList <- NULL
    }
    
    # STEP 5: Create clusters (if passed as NULL)
    if (is.null(useClusters)) {
        # Add population back to dfPerCapita (should improve this process)
        dfPerUse <- countyData %>%
            select(countyFIPS, pop) %>%
            mutate(countyFIPS=zeroPad5(countyFIPS)) %>%
            right_join(dfPerCapita, by=c("countyFIPS"))
        clData <- clusterCounties(dfPerCapita=dfPerUse, 
                                  hierarchical=hierarchical, 
                                  minPopCluster=minPopCluster, 
                                  ...
                                  )
        useClusters <- getCountyClusters(clData, 
                                         hier=hierarchical, 
                                         kCut=kCut, 
                                         reAssign=reAssignCounty, 
                                         defaultCluster=defaultCluster
                                         )
    }
    
    # STEP 6: Assess clusters
    if (skipAssessmentPlots) {
        plotDataList <- NULL
    } else {
        # Only countyFIPS and pop are taken from the county data; dfPerCapita is used as-is
        lstFuns <- list("stateData"=function(x) colSelector(x, vecSelect=c("countyFIPS", "pop")), 
                        "dfPerCapita"=NULL
                        )
        clFrame <- useClusters %>% 
            clustersToFrame(colNameName="countyFIPS") %>% 
            colMutater(selfList=list("countyFIPS"=zeroPad5))
        plotDataList <- diagnoseClusters(lst=list("stateData"=countyData, 
                                                  "dfPerCapita"=dfPerCapita
                                                  ), 
                                         lstExtract=lstFuns,
                                         clusterFrame=clFrame,
                                         brewPalette=brewPalette, 
                                         clusterType="county"
                                         )
    }
    
    # Return the key data, clusters, and (unless skipped) the cluster assessment outputs
    list(countyData=countyData, 
         dfRaw=dfRawList, 
         dfProcess=dfProcessList,
         dfPerCapita=dfPerCapita, 
         useClusters=useClusters, 
         maxDate=maxDate,
         plotDataList=plotDataList
         )
    
}
The full function is then tested, using the existing October 26 data. This requires an update to clusterCounties() as some of the data formats have changed:
# Function to take county-level data, prepare for clusterStates, and return resulting outputs
clusterCounties <- function(dfPerCapita,
                            hierarchical,
                            vecRename=c(),
                            clusterBy=c("countyFIPS"),
                            arrangeBy=c("date"),
                            burdenMetrics=c("cpm", "dpm"),
                            popVar=c("pop"),
                            vecSelect=c(clusterBy, arrangeBy, burdenMetrics, popVar),
                            uniqueBy=c(clusterBy, arrangeBy),
                            minPopCluster=1,
                            returnList=TRUE,
                            ...
                            ) {

    # FUNCTION ARGUMENTS:
    # dfPerCapita: a county-level file with per-capita metrics
    # hierarchical: whether to create hierarchical clusters
    #               TRUE means run hierarchical clustering
    #               FALSE means run kmeans clustering
    #               NA means run rules-based clustering
    # vecRename: renaming of input variables (applied before vecSelect)
    # clusterBy: the variable name used for clustering (a single column name)
    # arrangeBy: the variable that, together with clusterBy, makes up the default uniqueBy
    #            (the data are sorted by uniqueBy)
    # burdenMetrics: the metrics to be used for burden in clustering
    # popVar: the column containing population data (a single column name)
    # vecSelect: selection of input variables
    # uniqueBy: the input file must be unique by, and will then be sorted by, uniqueBy
    # minPopCluster: minimum population for including county in running cluster-level metrics
    # returnList: boolean, forwarded as-is to clusterStates()
    #             (this function itself always returns the list described below)
    # ...: other arguments that will be passed to clusterStates
    #
    # RETURNS: a list with elements
    #          objCluster: the object returned by clusterStates()
    #          countyFiltered: rows for counties at or above minPopCluster
    #          countyBelow: rows for counties below minPopCluster

    # clusterBy and popVar are each used to look up exactly one column
    stopifnot(is.character(clusterBy), length(clusterBy) == 1L,
              is.character(popVar), length(popVar) == 1L
              )

    # STEP 1: Select and rename variables from the dfPerCapita file
    # .data[[popVar]] is used rather than get(popVar) so that a missing column is an error
    # instead of silently picking up an object of the same name from the calling environment
    countyData <- dfPerCapita %>%
        colRenamer(vecRename=vecRename) %>%
        colSelector(vecSelect=vecSelect) %>%
        checkUniqueRows(uniqueBy=uniqueBy, returnDF=TRUE) %>%
        arrange(across(all_of(uniqueBy))) %>%
        mutate(popThresh=(.data[[popVar]]>=minPopCluster))

    # STEP 2: Split data based on population threshold
    countyFiltered <- countyData %>% filter(popThresh)
    countyBelow <- countyData %>% filter(!popThresh)

    # STEP 2a: Confirm that no county is in both data sets
    count(countyFiltered, a=.data[[clusterBy]], popThresh) %>%
        bind_rows(count(countyBelow, a=.data[[clusterBy]], popThresh)) %>%
        checkUniqueRows(uniqueBy=c("a"), returnDF=FALSE, noteUnique=FALSE)

    # STEP 3: Run county-level clusters
    # The clustering key is renamed to "state" before calling clusterStates()
    # (presumably clusterStates() expects its entity column under that name - should fix)
    # The mapping is built from clusterBy so that a non-default clusterBy is also renamed
    objCluster <- clusterStates(colRenamer(countyFiltered, vecRename=stats::setNames("state", clusterBy)),
                                hierarchical=hierarchical,
                                returnList=returnList,
                                ...
                                )

    # Return all of the relevant objects
    list(objCluster=objCluster,
         countyFiltered=countyFiltered,
         countyBelow=countyBelow
         )

}
# Function to obtain county clusters and return the county clusters vector
getCountyClusters <- function(obj,
                              hierarchical=FALSE,
                              kCut=0,
                              reAssign=list(),
                              defaultCluster=NULL
                              ) {

    # FUNCTION ARGUMENTS
    # obj: a clustering object returned by clusterCounties()
    #      (elements objCluster and countyBelow are used)
    # hierarchical: whether the clustering object is based on hierarchical clusters
    #               TRUE means from hierarchical clustering
    #               FALSE means from kmeans clustering
    #               NA means from rules-based clustering
    # kCut: if hierarchical clustering is used, what k (number of clusters in cutree) should be used?
    # reAssign: mapping file to change segments, as list('entity'='other entity cluster to use')
    # defaultCluster: cluster label given to every county found in obj$countyBelow
    #                 NULL means these counties are not added to the clustering vector

    # Cluster labels for the counties that were part of obj$objCluster
    clusterVec <- getClusters(obj$objCluster, hier=hierarchical, kCut=kCut, reAssign=reAssign)

    # Without a default label, the clustered counties are the complete result
    if (is.null(defaultCluster)) return(clusterVec)

    # Sorted, distinct county identifiers from obj$countyBelow, each given the default label
    belowFIPS <- sort(unique(pull(obj$countyBelow, countyFIPS)))
    belowVec <- purrr::set_names(rep(defaultCluster, length(belowFIPS)), belowFIPS)

    # Clustered counties first, followed by the default-labelled counties
    c(clusterVec, belowVec)

}
# Reference objects supplied as compareFile, keyed the same way as readList
compareList <- list(usafCase=testDFRefCase,
                    usafDeath=testDFRefDeath
                    )

# Locations of the existing USA Facts case and death files (downloaded 2020-10-26)
readList <- list(usafCase="./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201026.csv",
                 usafDeath="./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201026.csv"
                 )

# Run using existing clusters
# downloadTo is NA for any file that already exists on disk, and the file path otherwise
# A non-NULL useClusters means the function's cluster-creation step is skipped
testFullUSAF_v003 <- readRunUSAFacts(
    maxDate=NA,
    downloadTo=lapply(readList, FUN=function(path) { if (file.exists(path)) NA else path }),
    readFrom=readList,
    compareFile=compareList,
    writeLog="./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log",
    ovrwriteLog=TRUE,
    useClusters=testFullUSAF_v001$useClusters,
    skipAssessmentPlots=FALSE,
    brewPalette="Paired"
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 145 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 68 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v003.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 7.90e+8 8478790 885015
## 2 after 7.85e+8 8437719 870888
## 3 pctchg 6.42e-3 0.00484 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 2.76e+7 222573 885015
## 2 after 2.74e+7 221995 870888
## 3 pctchg 6.43e-3 0.00260 0.0160
## NULL
# Expected differences in useClusters (they exist) and plotDataList (they exist and have an environment)
# vapply() is used so the result is always a named logical vector, one element per list component
vapply(names(testFullUSAF_v003),
       FUN=function(x) identical(testFullUSAF_v003[[x]], testFullUSAF_v002[[x]]),
       FUN.VALUE=logical(1)
       )
## countyData dfRaw dfProcess dfPerCapita useClusters maxDate
## TRUE TRUE TRUE TRUE FALSE TRUE
## plotDataList
## FALSE
# Create new clusters
# useClusters=NULL triggers the function's cluster-creation step (clusterCounties / getCountyClusters)
# downloadTo is NA for any file that already exists on disk, and the file path otherwise
# NOTE(review): minShape through seed are presumably forwarded via ... to the clustering routine - confirm
testFullUSAF_v004 <- readRunUSAFacts(
    maxDate=NA,
    downloadTo=lapply(readList, FUN=function(path) { if (file.exists(path)) NA else path }),
    readFrom=readList,
    compareFile=compareList,
    writeLog="./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log",
    ovrwriteLog=TRUE,
    useClusters=NULL,
    skipAssessmentPlots=FALSE,
    brewPalette="Paired",
    defaultCluster="999",
    minPopCluster=25000,
    hierarchical=NA,
    minShape="2020-04",
    maxShape="2020-09",
    ratioDeathvsCase=0.001,
    ratioTotalvsShape=0.25,
    minDeath=100,
    minCase=5000,
    hmlSegs=3,
    eslSegs=3,
    seed=2010261358
)
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_confirmed_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 145 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
##
## No file has been downloaded, will use existing file: ./RInputFiles/Coronavirus/covid_deaths_usafacts_downloaded_20201026.csv
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## `County Name` = col_character(),
## State = col_character()
## )
## i Use `spec()` for the full column specifications.
##
## *** File has been checked for uniqueness by: countyFIPS countyName state stateFIPS
##
##
## *** File has been checked for uniqueness by: countyFIPS stateFIPS date
##
##
## Checking for similarity of: column names
## In reference but not in current:
## In current but not in reference:
##
## Checking for similarity of: date
## In reference but not in current: 0
## In current but not in reference: 39
## Detailed differences available in: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
## Checking for similarity of: county
## In reference but not in current:
## In current but not in reference:
##
##
## ***Differences of at least 5 and at least 5%
##
## 0 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
##
## ***Differences of at least 0 and at least 0.1%
##
## 68 records
## Detailed output available in log: ./RInputFiles/Coronavirus/USAF_Daily_Test_v004.log
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType cases new_cases n
## <chr> <dbl> <dbl> <dbl>
## 1 before 7.90e+8 8478790 885015
## 2 after 7.85e+8 8437719 870888
## 3 pctchg 6.42e-3 0.00484 0.0160
##
##
## Column sums before and after applying filtering rules:
## # A tibble: 3 x 4
## isType deaths new_deaths n
## <chr> <dbl> <dbl> <dbl>
## 1 before 2.76e+7 222573 885015
## 2 after 2.74e+7 221995 870888
## 3 pctchg 6.43e-3 0.00260 0.0160
##
##
## *** File has been checked for uniqueness by: countyFIPS date
## NULL
# Expected differences in useClusters (different data) and plotDataList (different data and environment)
# vapply() is used so the result is always a named logical vector, one element per list component
vapply(names(testFullUSAF_v004),
       FUN=function(x) identical(testFullUSAF_v004[[x]], testFullUSAF_v003[[x]]),
       FUN.VALUE=logical(1)
       )
## countyData dfRaw dfProcess dfPerCapita useClusters maxDate
## TRUE TRUE TRUE TRUE FALSE TRUE
## plotDataList
## FALSE
The updated functions appear to be working as intended. Next steps are to download the latest data and refresh the segments.